Set wd & load packages & read in data

setwd("/Users/kelseyschoenemann/Desktop/Code_Datasets")

library(readxl) #read in data
library(readr)
library(tidyverse) #manipulate the data
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2) #plot data
# install.packages("devtools")
# devtools::install_github("r-lib/conflicted")
library(conflicted) #https://conflicted.r-lib.org
conflicts_prefer(dplyr::filter)
## [conflicted] Will prefer dplyr::filter over any other package.

CapLog

#read in capture log of queens identity and behavior
CapLog <- read.csv("CaptureLog.csv")
CapLog[,c(13:15)][is.na(CapLog[,c(13:15)])] <- 0 #replacing NAs with 0s
CapLog$Bombus_spp<-as.factor(CapLog$Bombus_spp)
CapLog$BeeID<-as.factor(CapLog$BeeID)
CapLog$SiteID<-as.factor(CapLog$SiteID)
CapLog$Date<-as.POSIXct(CapLog$Date, format="%m/%e/%y")

SitDat

#read in site data about survey conditions
SitDat <- read.csv("SiteData.csv")
SitDat[,c(10:13)][is.na(SitDat[,c(10:13)])] <- 0 #replacing NAs with 0s
SitDat$SiteID<-as.factor(SitDat$SiteID)

Effort

#read in survey effort data about search times by habitat and observer
Effort <- read.csv("SurveyEffort.csv")
Effort$SiteID<-as.factor(Effort$SiteID)

FlorSpp

FlorSpp <- read.csv("FloweringSpp.csv")
FlorSpp$SiteID<-as.factor(FlorSpp$SiteID)

LandUse

INFO about zonal statistics (i.e., how exactly the data I’m using was tabulated): https://gis.stackexchange.com/questions/276794/how-does-qgis-zonal-statistics-handle-partially-overlapping-pixels

#read in land use data extracted near survey sites
SitesLandUse <-  read.csv("export-Sites.csv") 
SitesLandUse <- SitesLandUse[,-c(1,5)] #remove columns that aren't Name, SiteID, Type, County or land use columns
SitesLandUse[,7:21][is.na(SitesLandUse[,7:21])] <- 0 #replace NAs in land use columns with 0

# rename columns

#LAND USE CLASSES
#NLCD
#   ClassID Classification
#   11  Open Water
#   21  Developed, Open Space
#   22  Developed, Low Intensity
#   23  Developed, Medium Intensity 
#   24  Developed High Intensity
#   31  Barren Land (Rock/Sand/Clay) 
#   41  Deciduous Forest
#   42  Evergreen Forest
#   43  Mixed Forest
#   52  Shrub/Scrub
#   71  Grassland/Herbaceous
#   81  Pasture/Hay
#   82  Cultivated Crops 
#   90  Woody Wetlands
#   95  Emergent Herbaceous Wetlands

SitesLandUse2 <- SitesLandUse %>% rename(
'NLCD_Water1000'='NLCD_Sites_1000m_11',
'NLCD_DevelopedOpen1000'='NLCD_Sites_1000m_21',
'NLCD_DevelopedLow1000'='NLCD_Sites_1000m_22',
'NLCD_DevelopedMed1000'='NLCD_Sites_1000m_23',
'NLCD_DevelopedHigh1000'='NLCD_Sites_1000m_24',
'NLCD_Barren1000'='NLCD_Sites_1000m_31',
'NLCD_DeciduousForest1000'='NLCD_Sites_1000m_41',
'NLCD_EvergreenForest1000'='NLCD_Sites_1000m_42',
'NLCD_MixedForest1000'='NLCD_Sites_1000m_43',
'NLCD_ShrubScrub1000'='NLCD_Sites_1000m_52',
'NLCD_Grassland1000'='NLCD_Sites_1000m_71',
'NLCD_PastureHay1000'='NLCD_Sites_1000m_81',
'NLCD_CultivatedCrops1000'='NLCD_Sites_1000m_82',
'NLCD_WoodyWetlands1000'='NLCD_Sites_1000m_90',
'NLCD_HerbaceousWetlands1000'='NLCD_Sites_1000m_95'
)

rm(SitesLandUse)

make columns numeric, compute total cells/pixels for locations, select just NLCD data for sites, VGIN data for queens

SitesLandUse2[,7:21] <- lapply(SitesLandUse2[,7:21], as.numeric) #change values to numeric

SitesLandUse2 <- SitesLandUse2 %>% 
  rowwise() %>% mutate(TotalCells1000_NLCD = rowSums(across('NLCD_Water1000':'NLCD_HerbaceousWetlands1000'))) #create new columns that equal the sum of all cells within 1000m of site centerpoint

PollenIDs

#read in tables of ASV assignments for each sample 
#(datasets called 'long' because each sample spans multiple rows, one for each ASV within a sample)
#for rbcL
rbcL_IDs <- read.csv("rbcL.IDs.long.csv")
rbcL_IDs$Sample <- as.factor(rbcL_IDs$Sample)


#repeat for ITS2
ITS2_IDs <- read.csv("ITS2.IDs.long.csv")
ITS2_IDs[ITS2_IDs=="s_NA"]<-NA #explicitly call these text strings NA values
ITS2_IDs[ITS2_IDs=="g_NA"]<-NA

ITS2_IDs$Sample <- as.factor(ITS2_IDs$Sample)

#bind the rbcL and ITS2 sequence taxonomies
PollenIDs<-rbind(
  rbcL_IDs %>% 
  select(Sample, SampleTotalReads, Reads, Family, Genus, Species) %>% 
  mutate(Family=str_extract(Family, "(?<=f__)[a-zA-Z\\s]+(?=_)"),
         Genus=str_extract(Genus, "(?<=g__)[a-zA-Z\\s]+(?=_)"), #extracting character strings so taxa names will match up with those in ITS2 dataset
         Species=str_extract(Species, "(?<=\\s)[a-zA-Z\\s]+(?=_)"), #god bless chatgpt for this one doggamm
         Barcode="rbcL") %>%
    filter(SampleTotalReads>999) #filter out samples with less than 1000 reads in total
, # <-comma for rbind(rbcl,ITS2)
# Explanation of the Regex
# (?<=s__) - This is a lookbehind assertion that ensures the match occurs after "s__".
# [a-zA-Z\\s]+ - This matches one or more letters (both uppercase and lowercase) and spaces. (the + is important!)
# (?=_) - This is a lookahead assertion that ensures the match occurs before an underscore (_).
ITS2_IDs %>% 
  select(Sample, SampleTotalReads, Reads, Family, Genus, Species) %>% 
  mutate(Family=str_extract(Family, "(?<=f_)[a-zA-Z\\s]+"),
         Genus=str_extract(Genus, "(?<=g_)[a-zA-Z\\s]+"), #character string a little diff for ITS2
         Species=str_extract(Species, "(?<=\\s)[a-zA-Z\\s]+"), 
         Barcode="ITS2") %>%
  filter(SampleTotalReads>999) #filter out samples with less than 1000 reads in total
) %>% 
  mutate(BeeID=Sample, #rename variable
    #Barcode_Sample=paste(Barcode,Sample,sep = "_"), #create new variable with Barcode and Sample name appended together
    Genus_species=paste(Genus,Species,sep = "_")) %>%
  select(BeeID,Barcode,SampleTotalReads,Family,Genus,Species,Genus_species, Reads) %>% 
  mutate(PropReads=Reads/SampleTotalReads) %>%
# the data are still presented as ASVs (meaning multiple ASVs could be assigned to the same Genus/species, so now we will sum the reads for each ASV assigned to the same taxon for each sample) 
  group_by(across(BeeID:Genus_species)) %>% # group by sample and taxonomic assignment 
  summarise(across(Reads:PropReads, ~ sum(.x ))) %>% # and sum reads (or prop) 
  mutate(Family = # correcting out-dated taxonomic names inherited from rbcL reference database (affecting rbcL samples only)
           ifelse(.data$Family=="Aceraceae", "Sapindaceae", #change Acer../Hipp.. to Sapi..
                  ifelse(.data$Family=="Hippocastanaceae", "Sapindaceae",
                         ifelse(.data$Genus=="Nyssa", "Nyssaceae", #change Corn.. to Nyss (I only have two species from Cornaceae in my data, Nyssa sylvatica, which is now in Nyssaceae family, and Cornus spp, which remains in Cornaceae family); link: https://doi.org/10.1111/boj.12385
                         .data$Family))))
## `summarise()` has grouped output by 'BeeID', 'Barcode', 'SampleTotalReads',
## 'Family', 'Genus', 'Species'. You can override using the `.groups` argument.
PollenIDs <- as.data.frame(PollenIDs) # revert tibble to dataframe


rm(ITS2_IDs, rbcL_IDs)

Summarize Data

CapLog.sum & CapLog.spp

# generate summary table of queens captured by site-visit, behavior, pollen status
CapLog.sum <- CapLog %>%           #
  group_by(SiteID) %>%             ## at each site,
  summarise(CapFly=sum(Flying),    ## sum number of bees captured while flying
            CapForag=sum(Forage),  ## ...or foraging
            CapNest=sum(NestSeek), ## ..or nest-seeking
            TotalCaptured=sum(Flying,Forage,NestSeek),
            PollenBees=sum(CarryPollen))


# generate summary table of queens captured by site-visit and Bombus species
CapLog.spp <- CapLog %>% 
  group_by(SiteID, Bombus_spp) %>% 
  tally() %>% 
  pivot_wider(names_from = Bombus_spp, values_from = n) %>%
  select(SiteID,bimaculatus,griseocollis,impatiens,auricomus,'vagans-sandersoni-perplexus',fervidus,pensylvanicus)

SitDat.sum

# generate summary table of start/end times and temps and observed tallys of queens by survey-date 
## (this step was needed because Dave would sometimes do 2+ surveys at Blandy in a day, recording site conditions each time)
SitDat.sum <- as.data.frame(
  bind_rows(
  SitDat %>% filter(Name!="Blandy Experimental Farm"),
  SitDat %>% filter(Name=="Blandy Experimental Farm") %>% group_by(Date) %>%
                dplyr::summarize(SiteID=first(SiteID), Name=first(Name), Date=first(Date),
                  across('StartTimeH':'StartTimeM', list(dplyr::first)),
                  across('EndTimeH':'EndTempC', list(dplyr::last)),
                  across('DurationM', list(sum)),
                  across('CountFlying':'TotalObserved',list(sum))) %>%
                  'colnames<-'(c("Date","SiteID","Name", colnames(SitDat[,4:13]))) #renaming the summarized columns so they will match up with the original, un-summarized non-Blandy records
            ))
# The above code grabs rows from SitDat for sites OTHER than Blandy and combines (bind_rows) with the following:
# A summary of rows from SitDat for ONLY Blandy, grouped by Date, where I want
#   first Date, Name, and SiteID 
#   the first StartTime, and last EndTime and EndTemp
#   the sum of survey Duration
#   the sum of the Count columns (which are expected to be zero for Blandy, since Dave didn't count bees not caught)

rm(SitDat)

Effort.sum

# generate summary table of search time 
Effort.sum <- Effort %>% group_by(SiteID) %>%      #group by site
  summarise(SearchTime=sum(SearchTime))

SiteEffort

SiteEffort <- left_join(SitDat.sum,Effort.sum,by="SiteID")
SiteEffort$Date<-as.POSIXct(SiteEffort$Date, format="%d-%b-%y")

rm(Effort)

FlorSpp.sum

FlorSpp.sum <- FlorSpp %>% group_by(SiteID) %>%
  summarize(FloralRichness=n())

LandUse.sum

###SiteLandUse.sum, SiteLandUse.sum2

# generate summary table of land cover near survey sites with *simplified* land cover classes (sum)
# expressed as proportional cover (rather than numbers of cells) ----
SitesLandUse.sum <- SitesLandUse2 %>%  
    mutate(across('NLCD_Water1000':'NLCD_HerbaceousWetlands1000',~.x/TotalCells1000_NLCD)) %>%
                ## ^^ divide all values ('~.x') by TotalCells for the given dataset
    mutate(
        #simplify land use categories
        NLCD1000_Water     = NLCD_Water1000, 
        NLCD1000_Developed = sum(NLCD_DevelopedOpen1000 + NLCD_DevelopedLow1000 + NLCD_DevelopedMed1000 + NLCD_DevelopedHigh1000),
        NLCD1000_Forest    = sum(NLCD_DeciduousForest1000 + NLCD_MixedForest1000 + NLCD_WoodyWetlands1000 + NLCD_ShrubScrub1000 + NLCD_EvergreenForest1000), #shrubscrub = forest bc in my dataset it is rare, and is sometimes misclassified (when tree farms, fields with sparse tree), but it usually has trees, so forest seems appropriate
        NLCD1000_Herbaceous= sum(NLCD_Grassland1000 + NLCD_PastureHay1000 + NLCD_HerbaceousWetlands1000 + NLCD_Barren1000), #barren = grassland bc I checked on the few pixels in my dataset and it was a mis-assignment in the original land cover data (the aerial imagery clearly shows grassy field where barren was assigned)
        NLCD1000_Crop      = NLCD_CultivatedCrops1000
          ) %>%
  select(Name:Updated_y,NLCD1000_Water:NLCD1000_Crop)
rowSums(SitesLandUse.sum[,c(7:11)]) #OH YAY they all equal one 
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
summary(SitesLandUse.sum[,-c(1:6)])
##  NLCD1000_Water      NLCD1000_Developed NLCD1000_Forest   NLCD1000_Herbaceous
##  Min.   :0.0000000   Min.   :0.002023   Min.   :0.01677   Min.   :0.02255    
##  1st Qu.:0.0002889   1st Qu.:0.058071   1st Qu.:0.18507   1st Qu.:0.11276    
##  Median :0.0018768   Median :0.095361   Median :0.27444   Median :0.25808    
##  Mean   :0.0111786   Mean   :0.261119   Mean   :0.38044   Mean   :0.30326    
##  3rd Qu.:0.0060707   3rd Qu.:0.429609   3rd Qu.:0.56432   3rd Qu.:0.45714    
##  Max.   :0.0988154   Max.   :0.852227   Max.   :0.90546   Max.   :0.68091    
##  NLCD1000_Crop      
##  Min.   :0.0000000  
##  1st Qu.:0.0000000  
##  Median :0.0004338  
##  Mean   :0.0440006  
##  3rd Qu.:0.0417973  
##  Max.   :0.3327551
SiteLandUseSummary <- as.data.frame(SitesLandUse.sum[,-c(1:6)]) %>% 
  pivot_longer(.,cols=starts_with("NLCD"), names_to = "names", values_to = "value") %>%
  select(names,value) %>%
  separate_wider_delim(., names, delim="_", names = c("scale","class")) %>%
  mutate_at("value", as.numeric) %>%
  mutate_at("class", as.factor)
ggplot(SiteLandUseSummary, aes(x=class,y=value))+
  geom_boxplot()+
  ylab("Proportion of Area") +
  xlab("Land Cover Type") +
  theme_bw() +
  theme( 
        axis.title = element_text(size=18), 
        axis.text = element_text(size=12))

rm(SiteLandUseSummary)

boxplot(x = as.list(as.data.frame(SitesLandUse.sum[,-c(1:6)])))

PollenIDs.sum

# create a summary of pollen community for each barcode-sample at the genus level
# within a sample, multiple ASVs may have assigned to the same taxonomy, so we sum across those records
PollenIDgenera.sum <- PollenIDs %>%
  group_by(Barcode, BeeID, Genus) %>% #group by barcode, sample/BeeID, and assigned genus
  summarize(PropReads=sum(PropReads)) # sum the proportion of reads
## `summarise()` has grouped output by 'Barcode', 'BeeID'. You can override using
## the `.groups` argument.
# create a list of bees/samples with data from both barcodes
TwoBarcodeBees<-PollenIDgenera.sum %>% 
  select(BeeID, Barcode) %>% 
  distinct() %>% group_by(BeeID) %>% 
  summarise(nBarcode=n()) %>% 
  filter(nBarcode==2) %>% select(BeeID)


# create a summary of pollen community for each sample at the genus level (average of both barcodes)
PollenIDgenera.comm <- PollenIDs %>%
  group_by(BeeID, Genus) %>% # NOT GROUPING BY BARCODE (mean prop reads across barcodes)
  summarize(PropReads=sum(PropReads)/first(ifelse(.data$BeeID %in% TwoBarcodeBees$BeeID, 2,1))) %>% 
  pivot_wider(., names_from = Genus, values_from = PropReads, values_fill = 0) 
## `summarise()` has grouped output by 'BeeID'. You can override using the
## `.groups` argument.
# create list of genera appearing in a single sample
singletonGenera<-as.data.frame(
  PollenIDs %>% 
    group_by(Genus) %>% 
    summarize(n_bees=n_distinct(BeeID)) %>% 
   filter(n_bees==1) %>% 
    select(Genus)
  )$Genus

Join Summaries Together

dat.Sites <-  
  left_join(SiteEffort, CapLog.sum, by='SiteID') %>% #join site-effort to capture log summary
  left_join(., CapLog.spp, by='SiteID') %>% #join to that the capture log species summary
#install fuzzyjoin & comparator
  mutate(across(bimaculatus:pensylvanicus, ~replace_na(.x, 0))) %>%
  left_join(., SitesLandUse.sum, by = "Name") %>%
  select(-"SiteID.y") %>% #remove redundant column
  rename("SiteID"="SiteID.x") %>%  #fix column name messed up by joining
  mutate(across(CountFlying:pensylvanicus, ~replace_na(.,0))) %>% #replace NAs with 0s
#the current column (" . ")
  left_join(.,FlorSpp.sum,by='SiteID') %>% #join floral richness
  mutate(FloralRichness=if_else(is.na(FloralRichness)&Name!="Blandy Experimental Farm",0,FloralRichness))%>%
  mutate(Bees_60min=((TotalObserved+TotalCaptured)/SearchTime)*60, #calculate bees per 60-min survey
         logBees_60min=log(Bees_60min+1)) #calculate log() of that value



dat.Queen <-left_join(CapLog,PollenIDgenera.comm,by="BeeID") %>% #join capture log with pollen summary (both barcodes averaged)
  filter(!is.na(Cercis)) #filter rows without metabarcoding data


#rm(PollenIDgenera.comm)

Summary of pollen samples

Number of samples per barcode, reads per barcode-sample

#this dataset has Bombus_spp identity and keeps data from the two barcodes separate
dat<- left_join(
  PollenIDgenera.sum %>% pivot_wider(., names_from = "Genus", values_from = "PropReads"), 
  dat.Queen %>% select(BeeID, Bombus_spp), 
  by="BeeID") %>% 
  ungroup()

# table of count of ITS2, rbcL, both samples by Bombus species
dat %>% group_by(BeeID, Bombus_spp, Barcode) %>% summarise(count=n()) %>% 
  pivot_wider(., names_from = "Barcode", values_from = "count") %>% 
  mutate(across(ITS2:rbcL, ~replace_na(.x, 0))) %>% 
  reframe(BeeID=first(BeeID), Bombus_spp=first(Bombus_spp), ITS2=sum(ITS2), rbcL=sum(rbcL), both=ifelse(.data$ITS2 + .data$rbcL==2, "Y", "N")) %>% 
  group_by(Bombus_spp, both) %>% summarize(totalITS2=sum(ITS2), totalrbcL=sum(rbcL)) %>% 
  pivot_wider(., names_from = "both", values_from = c(totalITS2, totalrbcL)) %>%
  rename("ITS2"="totalITS2_N", "rbcL"="totalrbcL_N", "both"="totalITS2_Y") %>% 
  select(ITS2, rbcL, both) %>%
  mutate(across(ITS2:both, ~replace_na(.x, 0)))
## `summarise()` has grouped output by 'BeeID', 'Bombus_spp'. You can override
## using the `.groups` argument.
## `summarise()` has grouped output by 'Bombus_spp'. You can override using the
## `.groups` argument.
## Adding missing grouping variables: `Bombus_spp`
## # A tibble: 7 × 4
## # Groups:   Bombus_spp [7]
##   Bombus_spp                   ITS2  rbcL  both
##   <fct>                       <int> <int> <int>
## 1 auricomus                       0     1     2
## 2 bimaculatus                     6    22    28
## 3 fervidus                        0     5     3
## 4 griseocollis                    4     4    18
## 5 impatiens                       0     0    11
## 6 pensylvanicus                   0     0     1
## 7 vagans-sandersoni-perplexus     0     1     1
# how many barcode-samples
dim(PollenIDs %>% 
  group_by(BeeID, Barcode) %>%
  summarize(
    n=n()
  )) # three columns (BeeID, Barcode, n or number of records), and 171 BeeID-Barcode groups
## `summarise()` has grouped output by 'BeeID'. You can override using the
## `.groups` argument.
## [1] 171   3
# average and sd reads per barcode-sample
PollenIDs %>%
  group_by(BeeID, Barcode) %>%
  summarise(
    sumReads=sum(Reads)
  ) %>%
  group_by(Barcode) %>%
  summarise(
    mean=mean(sumReads), 
    sd=stats::sd(sumReads),
    min=min(sumReads),
    max=max(sumReads)
    )
## `summarise()` has grouped output by 'BeeID'. You can override using the
## `.groups` argument.
## # A tibble: 2 × 5
##   Barcode  mean    sd   min   max
##   <chr>   <dbl> <dbl> <int> <int>
## 1 ITS2    4579. 3604.  1062 16430
## 2 rbcL    9284. 5380.  1161 22161
# count of pollen samples by month and site
left_join(PollenIDgenera.comm %>% select(BeeID), 
          dat.Queen %>% select(BeeID, SiteID), 
          by="BeeID") %>% 
  left_join(., 
            dat.Sites %>% select(SiteID, Name, Date), 
            by="SiteID") %>%
  mutate(SimpName=ifelse(.data$Name=="Blandy Experimental Farm", Name, "All Other"),
         Month=lubridate::month(Date)) %>%
  group_by(Month, SimpName) %>% summarize(n=n()) %>%
  pivot_wider(names_from = Month, values_from = n, values_fill = 0)
## `summarise()` has grouped output by 'Month'. You can override using the
## `.groups` argument.
## # A tibble: 2 × 4
##   SimpName                   `3`   `4`   `5`
##   <chr>                    <int> <int> <int>
## 1 All Other                    2    50     0
## 2 Blandy Experimental Farm     0    36    19

Assignment rate

#total reads
PollenIDs %>%
  summarise(Reads=sum(Reads))
##     Reads
## 1 1239407
#number of reads UNassigned to Species, by barcode
NA_reads<-PollenIDs %>%
  group_by(Barcode, Species) %>%
  summarise(Reads=sum(Reads)) %>%
  filter(is.na(Species))
## `summarise()` has grouped output by 'Barcode'. You can override using the
## `.groups` argument.
#number of reads assigned to Species, by barcode
Assigned_reads<-PollenIDs %>%
  group_by(Barcode, Species) %>%
  summarise(Reads=sum(Reads)) %>%
  filter(!is.na(Species)) %>%
  group_by(Barcode) %>%
  summarise(Reads=sum(Reads))
## `summarise()` has grouped output by 'Barcode'. You can override using the
## `.groups` argument.
TotalReads<-NA_reads$Reads+Assigned_reads$Reads
#percent of reads *ASSIGNED* to Species, by barcode
(Assigned_reads$Reads / TotalReads) * 100 #ITS2 appears first (alphabetical)
## [1] 37.60540 73.45585
#number of reads UNassigned to Genus, by barcode
NA_reads<-PollenIDs %>%
  group_by(Barcode, Genus) %>%
  summarise(Reads=sum(Reads)) %>%
  filter(is.na(Genus))
## `summarise()` has grouped output by 'Barcode'. You can override using the
## `.groups` argument.
#number of reads assigned to Genus, by barcode
Assigned_reads<-PollenIDs %>%
  group_by(Barcode, Genus) %>%
  summarise(Reads=sum(Reads)) %>%
  filter(!is.na(Genus)) %>%
  group_by(Barcode) %>%
  summarise(Reads=sum(Reads))
## `summarise()` has grouped output by 'Barcode'. You can override using the
## `.groups` argument.
TotalReads<-NA_reads$Reads+Assigned_reads$Reads
#percent of reads *ASSIGNED* to genus, by barcode
(Assigned_reads$Reads / TotalReads) * 100 #ITS2 appears first (alphabetical)
## [1] 99.16266 89.68581
#number of reads UNassigned to Family, by barcode
NA_reads<-PollenIDs %>%
  group_by(Barcode, Family) %>%
  summarise(Reads=sum(Reads)) %>%
  filter(is.na(Family))
## `summarise()` has grouped output by 'Barcode'. You can override using the
## `.groups` argument.
#number of reads assigned to Family, by barcode
Assigned_reads<-PollenIDs %>%
  group_by(Barcode, Family) %>%
  summarise(Reads=sum(Reads)) %>%
  filter(!is.na(Family)) %>%
  group_by(Barcode) %>%
  summarise(Reads=sum(Reads))
## `summarise()` has grouped output by 'Barcode'. You can override using the
## `.groups` argument.
TotalReads<-NA_reads$Reads+Assigned_reads$Reads
#percent of reads *ASSIGNED* to Family, by barcode
(Assigned_reads$Reads / TotalReads) * 100 #ITS2 appears first (alphabetical)
## [1] 99.16266 89.68581
rm(NA_reads, Assigned_reads, TotalReads)

Rarefaction curves

#rarefaction of pollen species richness and read depth within each barcode-sample
dat<-PollenIDs %>% # (2 barcodes kept separate) # Genus_species
  group_by(Barcode, BeeID, Genus_species) %>% 
  summarize(Reads=sum(Reads)) %>%
  filter(Genus_species!="NA_NA") %>%
  pivot_wider(names_from = Genus_species, values_from = Reads, values_fill = 0) %>%
#  mutate(rowname=paste(.data$Barcode, .data$BeeID, sep="_")) %>%
ungroup() %>% select(-c(BeeID, Barcode))
## `summarise()` has grouped output by 'Barcode', 'BeeID'. You can override using
## the `.groups` argument.
dat<-as.data.frame(dat)
dat <- dat[rowSums(dat) > 0, ]

vegan::rarecurve(dat, step=20, tidy=T) %>%
  ggplot(aes(x=Sample, y=Species, col=Site))+
  geom_line()+
  labs(x="Read Depth", y="Species detected", col="")+
  theme(legend.position = "none") +
  guides(color="none")+
  lims(x=c(0,5000),y=c(0,25))+
  theme_bw(base_size = 18)
## Warning: Removed 24853 rows containing missing values or values outside the scale range
## (`geom_line()`).

#rarefaction of pollen species richness and sample size within each of the most common bombus species
dat<-PollenIDs %>% left_join(., dat.Queen %>% select(BeeID, Bombus_spp, Date), by="BeeID") %>%
  filter(Bombus_spp=="bimaculatus"|Bombus_spp=="griseocollis"|Bombus_spp=="impatiens") %>%
#  filter(.data$Genus %in% topGenera2$Genus) %>% #optional filter for most common genera
  group_by(Bombus_spp, Genus) %>% #first summarize across BeeID within Bombus_spp, pollen Genus
  summarize(nSamples=n_distinct(BeeID)) %>% 
  filter(!is.na(Genus)) %>%
  pivot_wider(names_from = Genus, values_from = nSamples, values_fill = 0) %>%
ungroup() %>% select(-c(Bombus_spp))
## `summarise()` has grouped output by 'Bombus_spp'. You can override using the
## `.groups` argument.
dat<-as.data.frame(dat)
dat <- dat[rowSums(dat) > 0, ]

vegan::rarecurve(dat, step=3, tidy=T) %>% 
  mutate(Site=case_match(Site, 
                         "1"~"bimaculatus",
                         "2"~"griseocollis",
                         "3"~"impatiens"))%>%
  ggplot(aes(x=Sample, y=Species, col=Site))+
  geom_line()+
  labs(x="Number of Pollen Samples", y="Genera detected", col="Bombus spp")+
  theme_bw(base_size = 18) 

Rank abundance and genera richness

rank_abundance = PollenIDs %>% 
  select(BeeID, Barcode, Family, Genus, PropReads, SampleTotalReads) %>%
  arrange(BeeID, Barcode, desc(PropReads)) %>%
  group_by(BeeID, Barcode) %>%
  mutate(abundance=PropReads,
    rank = rank(desc(PropReads), ties.method= "random")) %>% 
  mutate(rank=factor(rank)) %>%
  filter(!is.na(Family) & !is.na(Genus)) %>%
  ungroup()

summary(rank_abundance)
##      BeeID        Barcode             Family             Genus          
##  Bg011  :  31   Length:1363        Length:1363        Length:1363       
##  KLS0201:  26   Class :character   Class :character   Class :character  
##  Bg005  :  25   Mode  :character   Mode  :character   Mode  :character  
##  Bf001  :  24                                                           
##  Bb003  :  23                                                           
##  Bi002  :  23                                                           
##  (Other):1211                                                           
##    PropReads         SampleTotalReads   abundance              rank    
##  Min.   :0.0000602   Min.   : 1062    Min.   :0.0000602   1      :161  
##  1st Qu.:0.0031286   1st Qu.: 3836    1st Qu.:0.0031286   2      :150  
##  Median :0.0136636   Median : 7020    Median :0.0136636   3      :138  
##  Mean   :0.1158430   Mean   : 8516    Mean   :0.1158430   4      :121  
##  3rd Qu.:0.0868255   3rd Qu.:12842    3rd Qu.:0.0868255   5      :106  
##  Max.   :1.0000000   Max.   :22161    Max.   :1.0000000   6      : 90  
##                                                           (Other):597
# plot with rank abundance curve
rank_abundance  %>%
ggplot(aes(x = rank,
           y = abundance)) + 
  geom_boxplot(aes(fill=Barcode), position = position_dodge(preserve = "single"))+
  theme_bw(base_size=18)+
  scale_x_discrete(breaks = function(x){x[c(TRUE, FALSE)]})+
  facet_wrap(~Barcode) +
  labs(x="Abundance Rank", y="Proportion of Reads")

# overall genera richness of samples
PollenIDs %>%
  group_by(BeeID)%>%
  summarise(GeneraRichness=n_distinct(Genus)) %>%
  summarize(mean=mean(GeneraRichness))
## # A tibble: 1 × 1
##    mean
##   <dbl>
## 1  10.6
# by barcode, genera richness of samples
PollenIDs %>%
  group_by(BeeID,Barcode)%>%
  summarise(GeneraRichness=n_distinct(Genus)) %>%
  group_by(Barcode) %>%
  summarize(mean=mean(GeneraRichness), median=stats::median(GeneraRichness), low=min(GeneraRichness), high=max(GeneraRichness))
## `summarise()` has grouped output by 'BeeID'. You can override using the
## `.groups` argument.
## # A tibble: 2 × 5
##   Barcode  mean median   low  high
##   <chr>   <dbl>  <dbl> <int> <int>
## 1 ITS2     4.16      4     1    10
## 2 rbcL     9.72      9     1    21
# plot a histogram of barcod-samples' genera richness
PollenIDs %>%
  group_by(BeeID,Barcode)%>%
  summarise(GeneraRichness=n_distinct(Genus)) %>%
  ggplot(aes(x=GeneraRichness)) +
  geom_histogram()+
  facet_wrap(~Barcode)
## `summarise()` has grouped output by 'BeeID'. You can override using the
## `.groups` argument.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

rm(rank_abundance)

Most, least common pollen genera

# counting genera in pollen data
PollenIDs %>% distinct(Genus) # 94 total genera detected (plus 1 for NA/unassigned reads)
##            Genus
## 1         Prunus
## 2       Lonicera
## 3         Cercis
## 4           <NA>
## 5      Paulownia
## 6          Pinus
## 7       Platanus
## 8          Salix
## 9       Viburnum
## 10   Liquidambar
## 11      Brassica
## 12      Hesperis
## 13     Elaeagnus
## 14      Glechoma
## 15  Anthoxanthum
## 16         Viola
## 17     Mertensia
## 18       Quercus
## 19      Dicentra
## 20       Lupinus
## 21         Morus
## 22        Lamium
## 23    Polemonium
## 24 Chamaecyparis
## 25      Veronica
## 26     Taraxacum
## 27     Cardamine
## 28    Potentilla
## 29          Acer
## 30       Asimina
## 31         Vinca
## 32      Berberis
## 33  Cunninghamia
## 34        Lolium
## 35           Poa
## 36  Buglossoides
## 37      Medicago
## 38   Cynoglossum
## 39     Euscaphis
## 40  Mummenhoffia
## 41     Exochorda
## 42        Akebia
## 43      Magnolia
## 44         Pyrus
## 45     Trifolium
## 46        Ginkgo
## 47         Picea
## 48      Barbarea
## 49       Packera
## 50       Halesia
## 51   Podophyllum
## 52      Fraxinus
## 53      Dactylis
## 54       Robinia
## 55      Wisteria
## 56          Geum
## 57       Thlaspi
## 58   Stylophorum
## 59      Virgilia
## 60    Ranunculus
## 61       Syringa
## 62     Crataegus
## 63      Erigeron
## 64      Leucojum
## 65   Chelidonium
## 66      Photinia
## 67  Liriodendron
## 68    Fontanesia
## 69         Nyssa
## 70      Aesculus
## 71          Rosa
## 72     Gleditsia
## 73     Celastrus
## 74         Vitis
## 75    Securigera
## 76    Pulmonaria
## 77  Lamprocapnos
## 78        Aronia
## 79   Pachysandra
## 80     Gelsemium
## 81        Scilla
## 82         Malus
## 83     Vaccinium
## 84         Fagus
## 85         Vicia
## 86 Toxicodendron
## 87     Impatiens
## 88     Ligustrum
## 89   Gaylussacia
## 90        Cornus
## 91      Alliaria
## 92         Rubus
## 93       Anemone
## 94        Isatis
## 95       Spiraea
## 96        Phleum
#list of all singletonGenera (genera detected in only one sample, by either barcode)
singletonGenera<-as.data.frame( 
  PollenIDs %>% 
    group_by(Genus) %>% 
    summarize(n_bees=n_distinct(BeeID)) %>% 
   filter(n_bees==1) %>% 
    select(Genus)
  )$Genus


# identify most common pollen by applying filters on genera-level abunance metrics
temp<-PollenIDs %>%  
  group_by(BeeID, Genus) %>% 
  summarize(
    PropReads=sum(PropReads)/first(ifelse(.data$BeeID %in% TwoBarcodeBees$BeeID, 2,1)),
    Barcodes=paste(unique(.data$Barcode), collapse = ", "),
    Reads=sum(Reads)) %>% 
  group_by(Genus) %>% 
  summarize(Barcodes=Barcodes[which.max(str_length(Barcodes))],
            nSamples=n_distinct(BeeID),
            meanPropReads=mean(PropReads),
            maxPropReads=max(PropReads),
            Reads=sum(Reads)) %>%
  mutate(Barcodes=recode(Barcodes, 'ITS2, rbcL'="both", "rbcL, ITS2" = "both")) %>% 
  as.data.frame() %>%
#  filter(.data$Genus %in% topGenera1$Genus) %>%
  filter(nSamples>5) %>% # filter by number of samples
  filter(maxPropReads>0.25) %>% # filter genera with >25% max prop reads w/in a sample
  filter(meanPropReads>0.025) %>% #and filter genera with >2.5% mean prop reads
  pivot_longer(., cols=c(nSamples:Reads), 
               names_to = "metric", 
               values_to = "value") %>% 
  mutate(metric=factor(metric, levels=c("nSamples", "meanPropReads", "maxPropReads", "Reads"))) %>%
  drop_na(Genus)
## `summarise()` has grouped output by 'BeeID'. You can override using the
## `.groups` argument.
temp %>% filter(metric=="Reads") %>% mutate(topGeneraTotalReads=sum(value)) %>% select(topGeneraTotalReads) %>% distinct() # 947562 reads assigned to top genera
## # A tibble: 1 × 1
##   topGeneraTotalReads
##                 <dbl>
## 1              947562
947562/sum(PollenIDs$Reads) #~76% of ALL reads belong to one of the top genera
## [1] 0.7645285
PollenIDs %>% filter(is.na(Genus)) %>% mutate(NAReads=sum(Reads)) %>% select(NAReads) %>% distinct() #95726 unassigned (NA) reads
##   NAReads
## 1   95726
947562/(sum(PollenIDs$Reads)-95726) #~83% of assigned reads belong to one of the top genera
## [1] 0.8285195
topGenera2<-temp$Genus %>% as_tibble() %>% distinct() %>% as.data.frame()
colnames(topGenera2)<-"Genus"


temp %>%
  ggplot() + 
  geom_tile(data = temp %>% filter(metric=="nSamples") %>% droplevels, 
    aes(metric, Genus, fill=value)) +
  geom_text(data = temp %>% filter(metric=="nSamples") %>% droplevels, 
    aes(metric, Genus, 
        label=format(value, digits=2), 
        size=4)) +
  scale_fill_gradientn(name = "nSamples", colors=c("white","gold")) + 

  ggnewscale::new_scale_fill() + 
  geom_tile(data = temp %>% filter(metric=="meanPropReads") %>% droplevels, 
    aes(metric, Genus, fill=value)) + 
  geom_text(data = temp %>% filter(metric=="meanPropReads") %>% droplevels, 
    aes(metric, Genus, 
        label=format(value, digits=1), 
        size=4)) +
  scale_fill_gradientn(name = "meanPropReads", colors=c("white","gold")) + 
  ggnewscale::new_scale_fill() + 
  geom_tile(data = temp %>% filter(metric=="maxPropReads") %>% droplevels, 
    aes(metric, Genus, fill=value)) +
  geom_text(data = temp %>% filter(metric=="maxPropReads") %>% droplevels, 
    aes(metric, Genus, 
        label=format(value, digits=2), 
        size=4)) +
  scale_fill_gradientn(name = "maxPropReads", colors=c("white","gold")) + 

  scale_y_discrete(limits=rev, 
                   labels=paste(sort(unique(temp$Genus), decreasing = T), 
                                paste0("(", (temp %>% 
                                               group_by(Genus) %>% 
                                               summarize(Barcodes=first(Barcodes)) %>% 
                                               arrange(desc(Genus))
                                             )$Barcodes,
                                  ")"), sep=" ")) + # alternate:, sep="\n"
   scale_x_discrete(limits = c("nSamples","meanPropReads","maxPropReads"),
                    labels = c("Number of \n Samples \n (total)",
                               "Proportion of \n Sample Reads \n (mean)",
                               "Proportion of \n Sample Reads \n (max)")) +
  ylab("Pollen Genus (detected by)") +
  theme_bw(base_size = 18)+
    theme(legend.position = "none", 
          axis.title.x = element_text(size=0)) 

#calculating the min and max values of metrics
PollenIDs %>%  
  group_by(BeeID, Genus) %>% 
  summarize(
    PropReads=sum(PropReads)/first(ifelse(.data$BeeID %in% TwoBarcodeBees$BeeID, 2,1)),
    Barcodes=paste(unique(.data$Barcode), collapse = ", "),
    Reads=sum(Reads)) %>% 
  group_by(Genus) %>% 
  summarize(Barcodes=Barcodes[which.max(str_length(Barcodes))],
            nSamples=n_distinct(BeeID),
            Reads=sum(Reads), 
            meanPropReads=round(mean(PropReads),digits=2),
            minPropReads=round(min(PropReads),digits=2),
            maxPropReads=round(max(PropReads),digits=2),
            sdPropReads=round(stats::sd(PropReads),digits=2)) %>%
  mutate(Barcodes=recode(Barcodes, 'ITS2, rbcL'="both", "rbcL, ITS2" = "both")) %>% 
  as.data.frame() %>%
  filter(nSamples>5) %>% # filter by number of samples
  filter(maxPropReads>0.25) %>% # filter genera with >25% max prop reads w/in a sample
  filter(meanPropReads>0.025) #and filter genera with >2.5% mean prop reads
## `summarise()` has grouped output by 'BeeID'. You can override using the
## `.groups` argument.
##           Genus Barcodes nSamples  Reads meanPropReads minPropReads
## 1          Acer     both       36   8197          0.03         0.00
## 2      Aesculus     both        6  32238          0.40         0.04
## 3      Barbarea     rbcL        7  13453          0.16         0.00
## 4        Cercis     both       44 174003          0.26         0.00
## 5     Elaeagnus     rbcL       31  20845          0.05         0.00
## 6      Glechoma     rbcL       29  40676          0.11         0.00
## 7       Halesia     ITS2       12  26793          0.31         0.00
## 8        Lamium     both       46 135135          0.25         0.00
## 9  Liriodendron     rbcL       19  19506          0.10         0.00
## 10     Lonicera     rbcL       28  64660          0.20         0.00
## 11    Mertensia     ITS2       25  19491          0.14         0.00
## 12 Mummenhoffia     ITS2       13   5597          0.07         0.00
## 13    Paulownia     both       10  13567          0.11         0.00
## 14        Pinus     rbcL       85  57281          0.06         0.00
## 15       Prunus     both       37  81502          0.21         0.00
## 16      Quercus     both       30   9702          0.05         0.00
## 17        Salix     rbcL       41 107905          0.19         0.00
## 18    Taraxacum     ITS2       19   6118          0.07         0.00
## 19    Vaccinium     both        9  19670          0.14         0.00
## 20     Viburnum     both       29  10381          0.04         0.00
## 21        Viola     both       26  80842          0.14         0.00
## 22         <NA>     both       92  95726          0.09         0.00
##    maxPropReads sdPropReads
## 1          0.65        0.11
## 2          0.77        0.29
## 3          0.64        0.24
## 4          0.98        0.37
## 5          0.28        0.08
## 6          0.48        0.14
## 7          1.00        0.40
## 8          0.98        0.32
## 9          0.99        0.25
## 10         0.96        0.29
## 11         0.47        0.18
## 12         0.36        0.10
## 13         0.50        0.18
## 14         0.43        0.09
## 15         1.00        0.35
## 16         0.27        0.07
## 17         0.99        0.31
## 18         0.31        0.10
## 19         0.68        0.24
## 20         0.38        0.10
## 21         0.84        0.26
## 22         0.70        0.14
rm(temp)

Customized color palettes for plotting the common genera/families

# colors based on this palette
# colorBlindness::SteppedSequential5Steps

myColors.FamGen <- c("#990F0F", "#B22C2C" , "#CC5151" , "#E57E7E" , "#FFB2B2" , "#99540F" , "#B26F2C" , "#CC8E51" , "#E5B17E" , "#FFD8B2" , "#6B990F" , "#85B22C" , "#A3CC51" , "#C3E57E" , "#E5FFB2" , "#0F6B99" , "#2C85B2" , "#51A3CC" , "#7EC3E5" , "#B2E5FF" , "#260F99" , "#8F7EE5", "black")
names(myColors.FamGen) <- 
  sort(c(levels(
    factor(c((PollenIDs %>% filter(.data$Genus %in% topGenera2$Genus) %>% mutate(FamGen=paste(Family, Genus, sep=" ")) %>% arrange(Family) %>% select(FamGen))$FamGen,"z_AllOther"), 
    as.character(unique(c((PollenIDs %>% filter(.data$Genus %in% topGenera2$Genus) %>% mutate(FamGen=paste(Family, Genus, sep=" ")) %>% arrange(Family) %>% select(FamGen))$FamGen,"z_AllOther"))))
    ), "z_Shared"))



myColors.topGen <- c("#990F0F", "#B22C2C" , "#CC5151" , "#E57E7E" , "#FFB2B2" , "#99540F" , "#B26F2C" , "#CC8E51" , "#E5B17E" , "#FFD8B2" , "#6B990F" , "#85B22C" , "#A3CC51" , "#C3E57E" , "#E5FFB2" , "#0F6B99" , "#2C85B2" , "#51A3CC" , "#7EC3E5" , "#B2E5FF" , "#260F99" , "#8F7EE5" , "#BFB2FF")
names(myColors.topGen) <- 
  levels(
    factor(c((PollenIDs %>% filter(.data$Genus %in% topGenera2$Genus) %>% mutate(FamGen=paste(Family, Genus, sep=" ")) %>% arrange(Family) %>% select(Genus))$Genus, "z_AllOther"), 
    as.character(unique(c((PollenIDs %>% filter(.data$Genus %in% topGenera2$Genus) %>% mutate(FamGen=paste(Family, Genus, sep=" ")) %>% arrange(Family) %>% select(Genus))$Genus, "z_AllOther"))))
    )

#https://stackoverflow.com/questions/25098107/how-to-order-the-levels-of-factors-according-to-the-ordering-of-a-data-frame-an

Count of species in most common pollen genera

head(PollenIDs %>% 
       filter(Genus %in% topGenera2$Genus) %>% 
       filter(!is.na(Species)) %>%
       group_by(Genus, Barcode) %>% 
       summarise(
         first=first(Species), 
         n_spp=n_distinct(Species)))
## `summarise()` has grouped output by 'Genus'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 4
## # Groups:   Genus [4]
##   Genus    Barcode first         n_spp
##   <chr>    <chr>   <chr>         <int>
## 1 Acer     ITS2    negundo           4
## 2 Acer     rbcL    negundo           3
## 3 Aesculus ITS2    x marilandica     2
## 4 Aesculus rbcL    hippocastanum     1
## 5 Barbarea rbcL    vulgaris          1
## 6 Cercis   rbcL    siliquastrum      1
PollenIDs %>% 
       filter(Genus %in% topGenera2$Genus) %>% 
       filter(!is.na(Species)) %>%
       group_by(Genus, Barcode) %>% 
       summarise(
         first=first(Species), 
         n_spp=n_distinct(Species)) %>% 
  group_by(Barcode) %>%
  summarize(mean=mean(n_spp), min=min(n_spp), max=max(n_spp))
## `summarise()` has grouped output by 'Genus'. You can override using the
## `.groups` argument.
## # A tibble: 2 × 4
##   Barcode  mean   min   max
##   <chr>   <dbl> <int> <int>
## 1 ITS2      2       1     4
## 2 rbcL      1.6     1     4

Summary of (potential or actual) floral hosts

Most common blooming plants present at sites

#floral species common at sites
head(left_join(FlorSpp, dat.Sites %>% select(SiteID, Name, Date), by="SiteID"))
##   SiteID       Family        Genus    Species         Common              Name
## 1   WCP1     Rosaceae        Rubus        spp        Bramble Walnut Creek Park
## 2   WCP1     Fabaceae       Cercis canadensis Eastern Redbud Walnut Creek Park
## 3   WCP1    Lauraceae      Lindera    benzoin      Spicebush Walnut Creek Park
## 4   WCP1 Magnoliaceae Liriodendron tulipifera   Tulip Poplar Walnut Creek Park
## 5   WCP1  Sapindaceae         Acer        spp          Maple Walnut Creek Park
## 6   WCP1    Violaceae        Viola        spp         Violet Walnut Creek Park
##         Date
## 1 2022-03-21
## 2 2022-03-21
## 3 2022-03-21
## 4 2022-03-21
## 5 2022-03-21
## 6 2022-03-21
head(FlorSpp %>%
  group_by(Genus, SiteID) %>%
  summarise(n=n()) %>%
  group_by(Genus) %>%
  summarise(n=n(), prop=n/50) %>% #divide by 50 site visits (25 sites visited twice)
  arrange(-n))
## `summarise()` has grouped output by 'Genus'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 3
##   Genus         n  prop
##   <chr>     <int> <dbl>
## 1 Lamium       35  0.7 
## 2 Viola        35  0.7 
## 3 Taraxacum    32  0.64
## 4 Glechoma     28  0.56
## 5 Barbarea     19  0.38
## 6 Elaeagnus    19  0.38
# floral species richness over time (mean species/site per week)
left_join(FlorSpp, dat.Sites %>% select(SiteID, Name, Date), by="SiteID") %>%
  mutate(Genus_spp=paste(Genus, Species, sep="_")) %>%
  group_by(SiteID) %>%
  summarise(
    nGenus_spp=n_distinct(Genus_spp),
            ) %>% ungroup() %>%
left_join(., dat.Sites %>% select(SiteID, Name, Date), by="SiteID") %>%
  group_by(Week=lubridate::week(.data$Date)) %>%
  summarise(weekOf=first(Date),
            meannGenus_spp=mean(nGenus_spp),
            minnGenus_spp=min(nGenus_spp),
            maxnGenus_spp=max(nGenus_spp),
            n=n()
            )
## # A tibble: 7 × 6
##    Week weekOf              meannGenus_spp minnGenus_spp maxnGenus_spp     n
##   <dbl> <dttm>                       <dbl>         <int>         <int> <int>
## 1    12 2022-03-22 00:00:00           7.22             5            10     9
## 2    13 2022-03-30 00:00:00           7.17             5            11     6
## 3    14 2022-04-04 00:00:00           8.75             3            13     8
## 4    15 2022-04-12 00:00:00           9.38             5            14     8
## 5    16 2022-04-22 00:00:00           8.75             5            13     8
## 6    17 2022-04-29 00:00:00          11.5              5            14     8
## 7    18 2022-04-30 00:00:00          12.5              8            17     2

Most common blooming plants used by queens

# Among all *foraging queens observed*, what was the most common host plant?
left_join(CapLog,PollenIDgenera.comm,by="BeeID") %>% # alt version of dat.Queen retaining bees without pollen data (but a few of these queens carried pollen, but we don't have data on it)
  filter(!is.na(HostGenus)) %>% 
  group_by(HostGenus) %>% 
  summarize(bees=n_distinct(BeeID), propbees=n_distinct(BeeID)/414) %>%  #414 total foraging bees
  arrange(-bees) %>% 
  mutate(cumsum=cumsum(propbees))
## # A tibble: 33 × 4
##    HostGenus    bees propbees cumsum
##    <chr>       <int>    <dbl>  <dbl>
##  1 ""            169   0.408   0.408
##  2 "Lamium"      132   0.319   0.727
##  3 "Elaeagnus"   107   0.258   0.986
##  4 "Glechoma"     61   0.147   1.13 
##  5 "Prunus"       24   0.0580  1.19 
##  6 "Mertensia"    18   0.0435  1.23 
##  7 "Lonicera"     12   0.0290  1.26 
##  8 "Cercis"        8   0.0193  1.28 
##  9 "Barbarea"      6   0.0145  1.30 
## 10 "Vinca"         6   0.0145  1.31 
## # ℹ 23 more rows
# Among all *queens with pollen data*, what was the most common host plant?
dat.Queen %>% 
  filter(!is.na(HostGenus)) %>% 
  group_by(HostGenus) %>% 
  summarize(bees=n_distinct(BeeID), propbees=n_distinct(BeeID)/103) %>% #103 total foraging bees with pollen data
  arrange(-bees) %>% 
  mutate(cumsum=cumsum(propbees)) 
## # A tibble: 23 × 4
##    HostGenus     bees propbees cumsum
##    <chr>        <int>    <dbl>  <dbl>
##  1 "Elaeagnus"     30   0.291   0.291
##  2 "Glechoma"      19   0.184   0.476
##  3 "Lamium"        15   0.146   0.621
##  4 "Prunus"         8   0.0777  0.699
##  5 "Barbarea"       5   0.0485  0.748
##  6 ""               4   0.0388  0.786
##  7 "Mertensia"      4   0.0388  0.825
##  8 "Pyracantha"     4   0.0388  0.864
##  9 "Malus"          3   0.0291  0.893
## 10 "Cercis"         2   0.0194  0.913
## # ℹ 13 more rows

How does pollen composition change over time?

dat<-PollenIDs %>% left_join(., dat.Queen %>% select(BeeID, SiteID, Bombus_spp, Date), by="BeeID")  %>%
  filter(str_starts(SiteID, "B")) %>% #ONLY BLANDY BEES
  mutate(FamGen=paste(Family, Genus, sep=" ") # new varible to invoke in plotting
         ) %>% 
  group_by(BeeID, Family, Genus, FamGen) %>%
  summarise(
    Date=first(Date),
    Bombus_spp=first(Bombus_spp),
    PropReads=sum(PropReads)/first(ifelse(.data$BeeID %in% TwoBarcodeBees$BeeID, 2,1)), # calculate mean prop of reads for each genus for each bee
            ) %>% ungroup() %>% 
  mutate(Week=lubridate::week(Date)) %>%
  group_by(Week, Family, Genus, FamGen) %>% 
  summarise(weekly_value=sum(PropReads)) %>% ungroup() 
## `summarise()` has grouped output by 'BeeID', 'Family', 'Genus'. You can
## override using the `.groups` argument.
## `summarise()` has grouped output by 'Week', 'Family', 'Genus'. You can override
## using the `.groups` argument.
sort(unique(dat$FamGen))
##  [1] "Adoxaceae Viburnum"         "Altingiaceae Liquidambar"  
##  [3] "Amaryllidaceae Leucojum"    "Annonaceae Asimina"        
##  [5] "Apocynaceae Vinca"          "Asteraceae Erigeron"       
##  [7] "Asteraceae Packera"         "Asteraceae Taraxacum"      
##  [9] "Berberidaceae Berberis"     "Berberidaceae Podophyllum" 
## [11] "Boraginaceae Buglossoides"  "Boraginaceae Cynoglossum"  
## [13] "Boraginaceae Mertensia"     "Boraginaceae Pulmonaria"   
## [15] "Brassicaceae Barbarea"      "Brassicaceae Brassica"     
## [17] "Brassicaceae Cardamine"     "Brassicaceae Hesperis"     
## [19] "Brassicaceae Mummenhoffia"  "Brassicaceae Thlaspi"      
## [21] "Caprifoliaceae Lonicera"    "Celastraceae Celastrus"    
## [23] "Cupressaceae Chamaecyparis" "Cupressaceae Cunninghamia" 
## [25] "Elaeagnaceae Elaeagnus"     "Fabaceae Cercis"           
## [27] "Fabaceae Gleditsia"         "Fabaceae Lupinus"          
## [29] "Fabaceae Medicago"          "Fabaceae Robinia"          
## [31] "Fabaceae Securigera"        "Fabaceae Trifolium"        
## [33] "Fabaceae Virgilia"          "Fabaceae Wisteria"         
## [35] "Fagaceae Quercus"           "Ginkgoaceae Ginkgo"        
## [37] "Lamiaceae Glechoma"         "Lamiaceae Lamium"          
## [39] "Lardizabalaceae Akebia"     "Magnoliaceae Liriodendron" 
## [41] "Magnoliaceae Magnolia"      "Moraceae Morus"            
## [43] "NA NA"                      "Nyssaceae Nyssa"           
## [45] "Oleaceae Fontanesia"        "Oleaceae Fraxinus"         
## [47] "Oleaceae Syringa"           "Papaveraceae Chelidonium"  
## [49] "Papaveraceae Dicentra"      "Papaveraceae Lamprocapnos" 
## [51] "Papaveraceae Stylophorum"   "Paulowniaceae Paulownia"   
## [53] "Pinaceae Picea"             "Pinaceae Pinus"            
## [55] "Plantaginaceae Veronica"    "Platanaceae Platanus"      
## [57] "Poaceae Anthoxanthum"       "Poaceae Dactylis"          
## [59] "Poaceae Lolium"             "Poaceae Poa"               
## [61] "Polemoniaceae Polemonium"   "Ranunculaceae Ranunculus"  
## [63] "Rosaceae Crataegus"         "Rosaceae Exochorda"        
## [65] "Rosaceae Geum"              "Rosaceae Photinia"         
## [67] "Rosaceae Potentilla"        "Rosaceae Prunus"           
## [69] "Rosaceae Pyrus"             "Rosaceae Rosa"             
## [71] "Salicaceae Salix"           "Sapindaceae Acer"          
## [73] "Sapindaceae Aesculus"       "Staphyleaceae Euscaphis"   
## [75] "Styracaceae Halesia"        "Violaceae Viola"           
## [77] "Vitaceae Vitis"
n_distinct(dat$FamGen) #plus NA
## [1] 77
sort(unique(dat$Family))
##  [1] "Adoxaceae"       "Altingiaceae"    "Amaryllidaceae"  "Annonaceae"     
##  [5] "Apocynaceae"     "Asteraceae"      "Berberidaceae"   "Boraginaceae"   
##  [9] "Brassicaceae"    "Caprifoliaceae"  "Celastraceae"    "Cupressaceae"   
## [13] "Elaeagnaceae"    "Fabaceae"        "Fagaceae"        "Ginkgoaceae"    
## [17] "Lamiaceae"       "Lardizabalaceae" "Magnoliaceae"    "Moraceae"       
## [21] "Nyssaceae"       "Oleaceae"        "Papaveraceae"    "Paulowniaceae"  
## [25] "Pinaceae"        "Plantaginaceae"  "Platanaceae"     "Poaceae"        
## [29] "Polemoniaceae"   "Ranunculaceae"   "Rosaceae"        "Salicaceae"     
## [33] "Sapindaceae"     "Staphyleaceae"   "Styracaceae"     "Violaceae"      
## [37] "Vitaceae"
n_distinct(dat$Family) #plus NA
## [1] 38
#RColorBrewer::display.brewer.all(colorblindFriendly=TRUE)
# ,
dat %>%
  ggplot()+ 
  geom_col(aes(x=Week, y=weekly_value, 
               fill=ifelse(.data$Genus %in% topGenera2$Genus, FamGen, 
                        ifelse(!is.na(.data$Genus), "z_AllOther", NA))), 
           position = "fill")+
  scale_fill_manual(name="FamGen", values = myColors.FamGen, na.value = "gray")+
  geom_label(data=PollenIDs %>% left_join(., dat.Queen %>% select(BeeID, SiteID, Date), by="BeeID") %>%
               filter(str_starts(SiteID, "B")) %>%
               mutate(Week=lubridate::week(Date)) %>%
               group_by(Week) %>% summarize(nbees=n_distinct(BeeID)),
             aes(x=Week, y=-0.05, label=paste("n = ", nbees, sep="")))+
  guides(fill=guide_legend(title = "Family Genus")) +
  scale_x_continuous(breaks=c(unique(dat$Week)))+
  ylab("Proportion of Reads")+
  theme_bw(base_size = 18)+
  theme(legend.position = "bottom")

hist((PollenIDs %>% left_join(., dat.Queen %>% select(BeeID, SiteID, Date), by="BeeID") %>% distinct())$Date, breaks=30)
## Warning in breaks[-1L] + breaks[-nB]: NAs produced by integer overflow

Do common pollen genera correspond with floral survey detections?

# venn diagram: floral surveys intersected with most common genera
site_flowers <- FlorSpp %>% select(Genus) %>% filter(!str_ends(Genus,"NA")) %>% distinct()
pollen_flowers <- PollenIDs %>% drop_na(Genus) %>% select(Family, Genus, Species) %>% distinct(Genus)
gplots::venn(list(site_flowers$Genus, pollen_flowers$Genus), intersections = TRUE)

attr(gplots::venn(list(site_flowers$Genus, pollen_flowers$Genus), intersections = TRUE), "intersections")
## $A
##  [1] "Lindera"      "Narcissus"    "Forsythia"    "Phlox"        "Corydalis"   
##  [6] "Claytonia"    "Ficaria"      "Liriope"      "Ilex"         "Pryus"       
## [11] "Exochardo"    "Lobularia"    "Hyancinthus"  "Sympiocarpus" "Helleborus"  
## [16] "Geranium"     "Antennaria"   "Muscari"      "Houstonia"    "Erodium"     
## [21] "Thalictrum"   "Pieris"       "Buddleja"     "Oxalis"       "Stellaria"   
## [26] "Ajuga"        "Sassafras"    "Obolaria"     "Fragaria"     "Rhododendron"
## [31] "Adonis"       "Ornithogalum" "Papaver"      "Zizia"        "Aquilegia"   
## [36] "Gernium"      "Arisaema"    
## 
## $B
##  [1] "Paulownia"     "Pinus"         "Platanus"      "Liquidambar"  
##  [5] "Anthoxanthum"  "Quercus"       "Dicentra"      "Lupinus"      
##  [9] "Morus"         "Polemonium"    "Chamaecyparis" "Berberis"     
## [13] "Cunninghamia"  "Lolium"        "Poa"           "Medicago"     
## [17] "Cynoglossum"   "Euscaphis"     "Mummenhoffia"  "Exochorda"    
## [21] "Akebia"        "Ginkgo"        "Picea"         "Halesia"      
## [25] "Fraxinus"      "Dactylis"      "Robinia"       "Wisteria"     
## [29] "Geum"          "Stylophorum"   "Virgilia"      "Syringa"      
## [33] "Leucojum"      "Chelidonium"   "Photinia"      "Fontanesia"   
## [37] "Nyssa"         "Aesculus"      "Rosa"          "Gleditsia"    
## [41] "Celastrus"     "Vitis"         "Securigera"    "Pulmonaria"   
## [45] "Lamprocapnos"  "Pachysandra"   "Gelsemium"     "Scilla"       
## [49] "Fagus"         "Toxicodendron" "Impatiens"     "Ligustrum"    
## [53] "Gaylussacia"   "Anemone"       "Isatis"        "Phleum"       
## 
## $`A:B`
##  [1] "Rubus"        "Cercis"       "Liriodendron" "Acer"         "Viola"       
##  [6] "Pyrus"        "Prunus"       "Malus"        "Lamium"       "Taraxacum"   
## [11] "Mertensia"    "Lonicera"     "Glechoma"     "Cornus"       "Vinca"       
## [16] "Veronica"     "Elaeagnus"    "Cardamine"    "Buglossoides" "Ranunculus"  
## [21] "Brassica"     "Thlaspi"      "Barbarea"     "Trifolium"    "Magnolia"    
## [26] "Packera"      "Podophyllum"  "Viburnum"     "Potentilla"   "Vaccinium"   
## [31] "Alliaria"     "Erigeron"     "Vicia"        "Aronia"       "Salix"       
## [36] "Crataegus"    "Spiraea"      "Hesperis"     "Asimina"
#two interesting common genera not detected on floral surveys: Halesia and Paulownia (i recently dug up a video of a queen on Halesia lol) but I never noticed a princess tree in bloom. 
#the other 4 common genera not detected on floral surveys were trees: Chamaecyparis*, Pinus*, Platanus*, Quercus
# * meanPropReads < 0.05
rm(pollen_flowers, site_flowers)


# how does blooming plant occurrence vary over the season
FlorSpp %>% left_join(., dat.Sites %>%select(SiteID, Date), by="SiteID") %>%
    mutate(Week=lubridate::week(Date)) %>%
    group_by(Week) %>%
    mutate(nsites=n_distinct(SiteID)) %>%
    group_by(Week, Genus) %>%
    summarize(weekly_value=n_distinct(SiteID), nsites=first(nsites)) %>%
    mutate(weekly_value=weekly_value/nsites) %>%
    filter(.data$Genus %in% topGenera2$Genus & Week>13) %>%
ggplot()+
  geom_line(aes(x=Week, y=weekly_value, col=Genus), linewidth=1, position="jitter") +
  scale_color_manual(name="Genus", values = myColors.topGen, na.value = "gray")+
  ylim(0,1)
## `summarise()` has grouped output by 'Week'. You can override using the
## `.groups` argument.
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_line()`).

#Cercis peaks in week 15, but remains blooming at about 1/2 the sites for the entire season
#Lamium, Viola is common for the entire season
#Elaeagnus generally increased over time
#Barbarea, Cornus increased over time
#Viburnum appears at the end
#Taraxacum decreased over time

# plot each flower genus occurrence separately
plot<-FlorSpp %>% left_join(., dat.Sites %>% select(SiteID, Date), by="SiteID") %>%
    mutate(Week=lubridate::week(Date)) %>%
    group_by(Week) %>%
    mutate(nsites=n_distinct(SiteID)) %>%
    group_by(Week, Genus) %>%
    summarize(weekly_value=n_distinct(SiteID), nsites=first(nsites)) %>%
    mutate(weekly_value=weekly_value/nsites) %>%
    filter(Week>13) %>%
ggplot()+
  geom_line(aes(x=Week, y=weekly_value), linewidth=1) +
  geom_point(aes(x=Week, y=weekly_value), size=1) +
  ylim(0,1)+
  facet_wrap(~Genus)
## `summarise()` has grouped output by 'Week'. You can override using the
## `.groups` argument.
suppressMessages(print(plot))

How does pollen composition differ between ITS2 and rbcL

Venn diagram comparing the reference databases, detected taxa

# grab reference databases (sequences with taxonomic info)
rbcL.ref.tax<-"/Users/kelseyschoenemann/Desktop/Bioinformatics/ReferenceDatabases/rbcL-KarenBell_2021.07.08/rbcL_plus_Nov2019adds_Jul2021corrections.dada2.fa"
PLANiTS.sntx.kls <- "/Users/kelseyschoenemann/Desktop/Bioinformatics/ReferenceDatabases/ITS-PLANiTS_2020.03.29/ITS2.SINTAX_format-kls2.fas"

# extract just Family and Genus names from rbcL and ITS2 databases
ref_rbcL <- Biostrings::fasta.index(rbcL.ref.tax)
ref_rbcL_names <- as.data.frame(ref_rbcL$desc); colnames(ref_rbcL_names) <- "names"
ref_rbcL_split_names <- ref_rbcL_names %>% 
  separate_wider_delim(names, ";", names = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species","null")) %>% 
  select(-null) %>% 
  mutate(Family=str_extract(Family, "(?<=f__)[a-zA-Z\\s]+(?=_)"),
    Genus=str_extract(Genus, "(?<=g__)[a-zA-Z\\s]+(?=_)"), 
    Species=str_extract(Species, "(?<=\\s)[a-zA-Z\\s]+(?=_)"), 
    Barcode="rbcL") %>%
  mutate(Family = # correcting out-dated taxonomic names inherited from rbcL reference database (affecting rbcL samples only)
           ifelse(.data$Family=="Aceraceae", "Sapindaceae", #change Acer../Hipp.. to Sapi..
                  ifelse(.data$Family=="Hippocastanaceae", "Sapindaceae",
                         ifelse(.data$Genus=="Nyssa", "Nyssaceae", #change Corn.. to Nyss (I only have one species from Cornaceae in my data, Nyssa sylvatica, which is in Nyssaceae family) link: https://doi.org/10.1111/boj.12385
                         .data$Family)))) %>%
  mutate(FamGen=paste(Family,Genus,sep = "_")) %>%
  group_by(Barcode, Family, FamGen, Genus) %>%
  summarise(n=n_distinct(Species)) %>%
  filter(!str_ends(FamGen,"_NA")) %>%
  filter(!str_ends(FamGen,"_undef"))
## `summarise()` has grouped output by 'Barcode', 'Family', 'FamGen'. You can
## override using the `.groups` argument.
rm(ref_rbcL, ref_rbcL_names)


ref_its2<-Biostrings::fasta.index(PLANiTS.sntx.kls)
ref_its2_names<-as.data.frame(ref_its2$desc); colnames(ref_its2_names) <- "names"
ref_its2_split_names <- ref_its2_names %>% 
  separate_wider_delim(names, ";", names = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species","null"), too_many = "merge") %>% 
  select(-null) %>% 
  mutate(Family=str_extract(Family, "(?<=f_)[a-zA-Z\\s]+"),
    Genus=str_extract(Genus, "(?<=g_)[a-zA-Z\\s]+"), 
    Species=str_extract(Species, "(?<=\\s)[a-zA-Z\\s]+"),
    FamGen=paste(Family,Genus,sep = "_"),
    Barcode="ITS2") %>%
  group_by(Barcode, Family, FamGen, Genus) %>%
  summarise(n=n_distinct(Species)) %>%
  filter(!str_ends(FamGen,"_NA"))
## `summarise()` has grouped output by 'Barcode', 'Family', 'FamGen'. You can
## override using the `.groups` argument.
rm(ref_its2, ref_its2_names)


# NOTE: ITS2 db has several Genera listed in 2 Families:
# ref_its2_names %>% 
#   separate_wider_delim(names, ";", names = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species","null"), too_many = "merge") %>% 
#   select(-null) %>% 
#   mutate(Family=str_extract(Family, "(?<=f_)[a-zA-Z\\s]+"),
#     Genus=str_extract(Genus, "(?<=g_)[a-zA-Z\\s]+"), 
#     Species=str_extract(Species, "(?<=\\s)[a-zA-Z\\s]+"),
#     FamGen=paste(Family,Genus,sep = "_"),
#     Barcode="ITS2") %>%
#   group_by(Genus) %>% summarize(n_Family=n_distinct(Family)) %>% filter(n_Family!=1)

# NOTE CONT: but none of these Genera appear in my data
# (ref_its2_names %>% 
#   separate_wider_delim(names, ";", names = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species","null"), too_many = "merge") %>% 
#   select(-null) %>% 
#   mutate(Family=str_extract(Family, "(?<=f_)[a-zA-Z\\s]+"),
#     Genus=str_extract(Genus, "(?<=g_)[a-zA-Z\\s]+"), 
#     Species=str_extract(Species, "(?<=\\s)[a-zA-Z\\s]+"),
#     FamGen=paste(Family,Genus,sep = "_"),
#     Barcode="ITS2") %>%
#   group_by(Genus) %>% summarize(n_Family=n_distinct(Family)) %>% filter(n_Family!=1))$Genus %in% PollenIDs$Genus


# create list of Family-Genera detected in rbcL and ITS2 samples separately
rbcL_assigns<-PollenIDs %>%
  group_by(Barcode) %>%
  mutate(FamGen=paste0(Family,"_",Genus)) %>%
  filter(!str_ends(FamGen,"_NA")) %>%
  filter(Barcode=="rbcL") %>%
  ungroup() %>%
  select(Family, FamGen,Genus) %>% 
  distinct()

ITS2_assigns<-PollenIDs %>%
  group_by(Barcode) %>%
  mutate(FamGen=paste0(Family,"_",Genus)) %>%
  filter(!str_ends(FamGen,"_NA")) %>%
  filter(Barcode=="ITS2") %>%
  ungroup() %>%
  select(Family, FamGen,Genus) %>% 
  distinct()

ref_assigns<-PollenIDs %>%
  mutate(FamGen=paste0(Family,"_",Genus)) %>%
  filter(!str_ends(FamGen,"_NA")) %>%
  select(Family, FamGen,Genus) %>% 
  distinct()

knitr::kable(PollenIDs$Family %>% as_tibble() %>% distinct()) # a list of all Families detected in my data
value
Rosaceae
Caprifoliaceae
Fabaceae
NA
Paulowniaceae
Pinaceae
Platanaceae
Salicaceae
Adoxaceae
Altingiaceae
Brassicaceae
Elaeagnaceae
Lamiaceae
Poaceae
Violaceae
Boraginaceae
Fagaceae
Papaveraceae
Moraceae
Polemoniaceae
Cupressaceae
Plantaginaceae
Asteraceae
Sapindaceae
Annonaceae
Apocynaceae
Berberidaceae
Staphyleaceae
Lardizabalaceae
Magnoliaceae
Ginkgoaceae
Styracaceae
Oleaceae
Ranunculaceae
Amaryllidaceae
Nyssaceae
Celastraceae
Vitaceae
Buxaceae
Gelsemiaceae
Hyacinthaceae
Ericaceae
Anacardiaceae
Balsaminaceae
Cornaceae
 # venn diagram of Family-Genus entries in the ITS2 and rbcL dataBASES, and detected Family-Genera 
venn_refs<-gplots::venn(list(
  ref_its2_split_names$FamGen, # A
  ref_rbcL_split_names$FamGen,  # B
  ref_assigns$FamGen), intersections = TRUE) # C

attr(gplots::venn(list( # similar to above but at the Family level. There are 0 Families detected by just a single dataset
  ref_its2_split_names$Family %>% unique(), # A
  ref_rbcL_split_names$Family %>% unique(), # B
  ref_assigns$Family), intersections = TRUE), "intersections") # C

## $A
##   [1] "Acrobolbaceae"        "Actinochloridaceae"   "Adelanthaceae"       
##   [4] "Allisoniaceae"        "Amblystegiaceae"      "Anastrophyllaceae"   
##   [7] "Aneuraceae"           "Anomodontaceae"       "Antheliaceae"        
##  [10] "Aphanochaetaceae"     "Asphodelaceae"        "Aspleniaceae"        
##  [13] "Asteromonadaceae"     "Astrephomenaceae"     "Aulacomniaceae"      
##  [16] "Balanophoraceae"      "Barrancaceae"         "Bartramiaceae"       
##  [19] "Bathycoccaceae"       "Blechnaceae"          "Blepharostomataceae" 
##  [22] "Brachytheciaceae"     "Bracteacoccaceae"     "Brevianthaceae"      
##  [25] "Bruchiaceae"          "Bryaceae"             "Bryobartramiaceae"   
##  [28] "Bryoxiphiaceae"       "Callicladiaceae"      "Calliergonaceae"     
##  [31] "Calymperaceae"        "Calypogeiaceae"       "Catagoniaceae"       
##  [34] "Caulerpaceae"         "Cephaloziaceae"       "Cephaloziellaceae"   
##  [37] "Chaetopeltidaceae"    "Chaetophoraceae"      "Characeae"           
##  [40] "Chenopodiaceae"       "Chlamydomonadaceae"   "Chlorellaceae"       
##  [43] "Chlorococcaceae"      "Chlorodendraceae"     "Chloropicaceae"      
##  [46] "Chromochloridaceae"   "Cladophoraceae"       "Climaciaceae"        
##  [49] "Cloniophoraceae"      "Coldeniaceae"         "Corbichoniaceae"     
##  [52] "Corsiaceae"           "Crustomastigaceae"    "Cryphaeaceae"        
##  [55] "Cyatheaceae"          "Cyathodiaceae"        "Cynomoriaceae"       
##  [58] "Daltoniaceae"         "Delavayellaceae"      "Dendrocerotaceae"    
##  [61] "Desmidiaceae"         "Dicksoniaceae"        "Dicranaceae"         
##  [64] "Dictyochloridaceae"   "Dictyococcaceae"      "Disceliaceae"        
##  [67] "Ditrichaceae"         "Dolichomastigaceae"   "Drummondiaceae"      
##  [70] "Dryopteridaceae"      "Dumortieraceae"       "Dunaliellaceae"      
##  [73] "Echinodiaceae"        "Encalyptaceae"        "Entodontaceae"       
##  [76] "Equisetaceae"         "Fabroniaceae"         "Fissidentaceae"      
##  [79] "Flatbergiaceae"       "Fontinalaceae"        "Fossombroniaceae"    
##  [82] "Francoaceae"          "Fritschiellaceae"     "Frullaniaceae"       
##  [85] "Funariaceae"          "Gigaspermaceae"       "Grimmiaceae"         
##  [88] "Gymnomitriaceae"      "Haematococcaceae"     "Halimedaceae"        
##  [91] "Hedwigiaceae"         "Helodiaceae"          "Herbertaceae"        
##  [94] "Hookeriaceae"         "Hydrodictyaceae"      "Hylocomiaceae"       
##  [97] "Hymenophytaceae"      "Hypnaceae"            "Hypnodendraceae"     
## [100] "Hypopterygiaceae"     "Isoetaceae"           "Jamesoniellaceae"    
## [103] "Jocheniaceae"         "Jubulaceae"           "Jungermanniaceae"    
## [106] "Klebsormidiaceae"     "Koliellaceae"         "Kornmanniaceae"      
## [109] "Leeaceae"             "Lejeuneaceae"         "Lembophyllaceae"     
## [112] "Lennoaceae"           "Lepidolaenaceae"      "Lepidoziaceae"       
## [115] "Leptodontaceae"       "Leptosiraceae"        "Lepyrodontaceae"     
## [118] "Leskeaceae"           "Leucobryaceae"        "Leucodontaceae"      
## [121] "Leucomiaceae"         "Lindsaeaceae"         "Lophocoleaceae"      
## [124] "Lophoziaceae"         "Lycopodiaceae"        "Lygodiaceae"         
## [127] "Mamiellaceae"         "Marchantiaceae"       "Marsileaceae"        
## [130] "Mastigophoraceae"     "Meesiaceae"           "Meteoriaceae"        
## [133] "Metzgeriaceae"        "Microteaceae"         "Miyabeaceae"         
## [136] "Mniaceae"             "Moerckiaceae"         "Monocleaceae"        
## [139] "Monomastigaceae"      "Monostromataceae"     "Mychonastaceae"      
## [142] "Myliaceae"            "Myriniaceae"          "Mystropetalaceae"    
## [145] "Myuriaceae"           "NA"                   "Namaceae"            
## [148] "Neckeraceae"          "Neochloridaceae"      "Notothyladaceae"     
## [151] "Onocleaceae"          "Oocystaceae"          "Orthodontiaceae"     
## [154] "Orthotrichaceae"      "Pallaviciniaceae"     "Pedinomonadaceae"    
## [157] "Pelliaceae"           "Petiveriaceae"        "Phacotaceae"         
## [160] "Phyllogoniaceae"      "Pilotrichaceae"       "Pithophoraceae"      
## [163] "Plagiochilaceae"      "Plagiotheciaceae"     "Polypodiaceae"       
## [166] "Polytrichaceae"       "Porellaceae"          "Pottiaceae"          
## [169] "Prasiolaceae"         "Prionodontaceae"      "Protosiphonaceae"    
## [172] "Pseudocladophoraceae" "Pseudomuriellaceae"   "Psilotaceae"         
## [175] "Pteridaceae"          "Pterigynandraceae"    "Pterobryaceae"       
## [178] "Ptilidiaceae"         "Ptychomniaceae"       "Pylaisiaceae"        
## [181] "Pylaisiadelphaceae"   "Radiococcaceae"       "Radulaceae"          
## [184] "Regmatodontaceae"     "Rhabdoweisiaceae"     "Rhacocarpaceae"      
## [187] "Rhizogoniaceae"       "Rhytidiaceae"         "Ricciaceae"          
## [190] "Rotundellaceae"       "Rutenbergiaceae"      "Saelaniaceae"        
## [193] "Salviniaceae"         "Saulomataceae"        "Scapaniaceae"        
## [196] "Scenedesmaceae"       "Schimperobryaceae"    "Schizochlamydaceae"  
## [199] "Schizomeridaceae"     "Schroederiaceae"      "Scouleriaceae"       
## [202] "Selaginellaceae"      "Selenastraceae"       "Seligeriaceae"       
## [205] "Sematophyllaceae"     "Siphonocladaceae"     "Solenostomataceae"   
## [208] "Sphaeropleaceae"      "Sphagnaceae"          "Splachnaceae"        
## [211] "Spondylomoraceae"     "Stereodontaceae"      "Stereophyllaceae"    
## [214] "Symphyodontaceae"     "Taccaceae"            "Takakiaceae"         
## [217] "Taxiphyllaceae"       "Tetrabaenaceae"       "Tetraphidaceae"      
## [220] "Theliaceae"           "Thismiaceae"          "Thuidiaceae"         
## [223] "Trachylomataceae"     "Trebouxiaceae"        "Trentepohliaceae"    
## [226] "Tumidellaceae"        "Tupiellaceae"         "Udoteaceae"          
## [229] "Ulotrichaceae"        "Ulvaceae"             "Ulvellaceae"         
## [232] "Uronemataceae"        "Volvocaceae"          "Zygnemataceae"       
## 
## $B
##   [1] "Achatocarpaceae"    "Aextoxicaceae"      "Amborellaceae"     
##   [4] "Anarthriaceae"      "Aphloiaceae"        "Asteliaceae"       
##   [7] "Asteropeiaceae"     "Atherospermataceae" "Balanopaceae"      
##  [10] "Barbeuiaceae"       "Barbeyaceae"        "Bataceae"          
##  [13] "Berberidopsidaceae" "Biebersteiniaceae"  "Blandfordiaceae"   
##  [16] "Borthwickiaceae"    "Brunelliaceae"      "Campynemataceae"   
##  [19] "Cardiopteridaceae"  "Centrolepidaceae"   "Centroplacaceae"   
##  [22] "Cephalotaceae"      "Columelliaceae"     "Ctenolophonaceae"  
##  [25] "Curtisiaceae"       "Cyclanthaceae"      "Dasypogonaceae"    
##  [28] "Degeneriaceae"      "Dipentodontaceae"   "Dirachmaceae"      
##  [31] "Doryanthaceae"      "Ecdeiocoleaceae"    "Emblingiaceae"     
##  [34] "Escalloniaceae"     "Eupomatiaceae"      "Geissolomataceae"  
##  [37] "Gerrardinaceae"     "Goupiaceae"         "Grubbiaceae"       
##  [40] "Guamatelaceae"      "Haemodoraceae"      "Hernandiaceae"     
##  [43] "Himantandraceae"    "Huaceae"            "Hydrostachyaceae"  
##  [46] "Irvingiaceae"       "Ixonanthaceae"      "Koeberliniaceae"   
##  [49] "Lacistemataceae"    "Lactoridaceae"      "Lanariaceae"       
##  [52] "Lepidobotryaceae"   "Lophopyxidaceae"    "Macarthuriaceae"   
##  [55] "Mayacaceae"         "Melianthaceae"      "Montiniaceae"      
##  [58] "Myrothamnaceae"     "Nanodeaceae"        "Octoknemaceae"     
##  [61] "Olacaceae"          "Oncothecaceae"      "Pandaceae"         
##  [64] "Paracryphiaceae"    "Pentaphragmataceae" "Peridiscaceae"     
##  [67] "Petenaeaceae"       "Petermanniaceae"    "Petrosaviaceae"    
##  [70] "Phellinaceae"       "Philydraceae"       "Physenaceae"       
##  [73] "Plocospermataceae"  "Posidoniaceae"      "Quillajaceae"      
##  [76] "Rehmanniaceae"      "Restionaceae"       "Rhabdodendraceae"  
##  [79] "Schlegeliaceae"     "Setchellanthaceae"  "Simmondsiaceae"    
##  [82] "Sphaerosepalaceae"  "Sphenocleaceae"     "Stemonaceae"       
##  [85] "Strasburgeriaceae"  "Strombosiaceae"     "Surianaceae"       
##  [88] "Tecophilaeaceae"    "Thurniaceae"        "Torricelliaceae"   
##  [91] "Trigoniaceae"       "Trimeniaceae"       "Vahliaceae"        
##  [94] "Velloziaceae"       "Vivianiaceae"       "Welwitschiaceae"   
##  [97] "Xanthoceraceae"     "Xanthorrhoeaceae"   "Xeronemataceae"    
## [100] "undef"             
## 
## $`A:B`
##   [1] "Acanthaceae"        "Achariaceae"        "Acoraceae"         
##   [4] "Actinidiaceae"      "Agdestidaceae"      "Aizoaceae"         
##   [7] "Akaniaceae"         "Alismataceae"       "Alseuosmiaceae"    
##  [10] "Alstroemeriaceae"   "Alzateaceae"        "Amaranthaceae"     
##  [13] "Amphorogynaceae"    "Anacampserotaceae"  "Ancistrocladaceae" 
##  [16] "Anisophylleaceae"   "Aphanopetalaceae"   "Apiaceae"          
##  [19] "Aponogetonaceae"    "Aptandraceae"       "Aquifoliaceae"     
##  [22] "Araceae"            "Araliaceae"         "Araucariaceae"     
##  [25] "Arecaceae"          "Argophyllaceae"     "Aristolochiaceae"  
##  [28] "Asparagaceae"       "Austrobaileyaceae"  "Basellaceae"       
##  [31] "Begoniaceae"        "Betulaceae"         "Bignoniaceae"      
##  [34] "Bixaceae"           "Bonnetiaceae"       "Boryaceae"         
##  [37] "Bromeliaceae"       "Bruniaceae"         "Burmanniaceae"     
##  [40] "Burseraceae"        "Butomaceae"         "Byblidaceae"       
##  [43] "Cabombaceae"        "Cactaceae"          "Calceolariaceae"   
##  [46] "Calophyllaceae"     "Calycanthaceae"     "Calyceraceae"      
##  [49] "Campanulaceae"      "Canellaceae"        "Cannabaceae"       
##  [52] "Cannaceae"          "Capparaceae"        "Caricaceae"        
##  [55] "Carlemanniaceae"    "Caryocaraceae"      "Caryophyllaceae"   
##  [58] "Casuarinaceae"      "Ceratophyllaceae"   "Cercidiphyllaceae" 
##  [61] "Cervantesiaceae"    "Chloranthaceae"     "Chrysobalanaceae"  
##  [64] "Circaeasteraceae"   "Cistaceae"          "Cleomaceae"        
##  [67] "Clethraceae"        "Clusiaceae"         "Codonaceae"        
##  [70] "Colchicaceae"       "Comandraceae"       "Combretaceae"      
##  [73] "Commelinaceae"      "Connaraceae"        "Convolvulaceae"    
##  [76] "Cordiaceae"         "Coriariaceae"       "Corynocarpaceae"   
##  [79] "Costaceae"          "Coulaceae"          "Crassulaceae"      
##  [82] "Crossosomataceae"   "Crypteroniaceae"    "Cucurbitaceae"     
##  [85] "Cunoniaceae"        "Cycadaceae"         "Cymodoceaceae"     
##  [88] "Cyperaceae"         "Cyrillaceae"        "Daphniphyllaceae"  
##  [91] "Datiscaceae"        "Diapensiaceae"      "Dichapetalaceae"   
##  [94] "Didiereaceae"       "Dilleniaceae"       "Dioncophyllaceae"  
##  [97] "Dioscoreaceae"      "Dipterocarpaceae"   "Droseraceae"       
## [100] "Drosophyllaceae"    "Ebenaceae"          "Ehretiaceae"       
## [103] "Elaeocarpaceae"     "Elatinaceae"        "Ephedraceae"       
## [106] "Eriocaulaceae"      "Erythropalaceae"    "Erythroxylaceae"   
## [109] "Eucommiaceae"       "Euphorbiaceae"      "Euphroniaceae"     
## [112] "Eupteleaceae"       "Flagellariaceae"    "Fouquieriaceae"    
## [115] "Frankeniaceae"      "Garryaceae"         "Gentianaceae"      
## [118] "Geraniaceae"        "Gesneriaceae"       "Gisekiaceae"       
## [121] "Gnetaceae"          "Gomortegaceae"      "Goodeniaceae"      
## [124] "Griseliniaceae"     "Grossulariaceae"    "Gunneraceae"       
## [127] "Gyrostemonaceae"    "Halophytaceae"      "Haloragaceae"      
## [130] "Hamamelidaceae"     "Hanguanaceae"       "Heliconiaceae"     
## [133] "Heliotropiaceae"    "Helwingiaceae"      "Humiriaceae"       
## [136] "Hydatellaceae"      "Hydrangeaceae"      "Hydrocharitaceae"  
## [139] "Hydroleaceae"       "Hydrophyllaceae"    "Hypericaceae"      
## [142] "Hypoxidaceae"       "Icacinaceae"        "Iridaceae"         
## [145] "Iteaceae"           "Ixioliriaceae"      "Joinvilleaceae"    
## [148] "Juglandaceae"       "Juncaceae"          "Juncaginaceae"     
## [151] "Kewaceae"           "Krameriaceae"       "Lauraceae"         
## [154] "Lecythidaceae"      "Lentibulariaceae"   "Liliaceae"         
## [157] "Limeaceae"          "Limnanthaceae"      "Linaceae"          
## [160] "Linderniaceae"      "Loasaceae"          "Loganiaceae"       
## [163] "Lophiocarpaceae"    "Loranthaceae"       "Lowiaceae"         
## [166] "Lythraceae"         "Malpighiaceae"      "Malvaceae"         
## [169] "Marantaceae"        "Marcgraviaceae"     "Martyniaceae"      
## [172] "Maundiaceae"        "Mazaceae"           "Melanthiaceae"     
## [175] "Melastomataceae"    "Meliaceae"          "Menispermaceae"    
## [178] "Menyanthaceae"      "Metteniusaceae"     "Misodendraceae"    
## [181] "Molluginaceae"      "Monimiaceae"        "Montiaceae"        
## [184] "Moringaceae"        "Muntingiaceae"      "Musaceae"          
## [187] "Myodocarpaceae"     "Myricaceae"         "Myristicaceae"     
## [190] "Myrtaceae"          "Nartheciaceae"      "Nelumbonaceae"     
## [193] "Nepenthaceae"       "Neuradaceae"        "Nitrariaceae"      
## [196] "Nothofagaceae"      "Nyctaginaceae"      "Nymphaeaceae"      
## [199] "Ochnaceae"          "Onagraceae"         "Opiliaceae"        
## [202] "Orchidaceae"        "Orobanchaceae"      "Oxalidaceae"       
## [205] "Paeoniaceae"        "Pandanaceae"        "Passifloraceae"    
## [208] "Pedaliaceae"        "Penaeaceae"         "Pennantiaceae"     
## [211] "Pentaphylacaceae"   "Penthoraceae"       "Peraceae"          
## [214] "Philesiaceae"       "Phrymaceae"         "Phyllanthaceae"    
## [217] "Phyllonomaceae"     "Phytolaccaceae"     "Picramniaceae"     
## [220] "Picrodendraceae"    "Piperaceae"         "Pittosporaceae"    
## [223] "Plumbaginaceae"     "Podocarpaceae"      "Podostemaceae"     
## [226] "Polygalaceae"       "Polygonaceae"       "Pontederiaceae"    
## [229] "Portulacaceae"      "Potamogetonaceae"   "Primulaceae"       
## [232] "Proteaceae"         "Putranjivaceae"     "Rapateaceae"       
## [235] "Resedaceae"         "Rhamnaceae"         "Rhizophoraceae"    
## [238] "Ripogonaceae"       "Roridulaceae"       "Rousseaceae"       
## [241] "Rubiaceae"          "Rutaceae"           "Sabiaceae"         
## [244] "Salvadoraceae"      "Santalaceae"        "Sapotaceae"        
## [247] "Sarcobataceae"      "Sarcolaenaceae"     "Sarraceniaceae"    
## [250] "Saururaceae"        "Saxifragaceae"      "Scheuchzeriaceae"  
## [253] "Schisandraceae"     "Schoepfiaceae"      "Sciadopityaceae"   
## [256] "Scrophulariaceae"   "Simaroubaceae"      "Siparunaceae"      
## [259] "Sladeniaceae"       "Smilacaceae"        "Solanaceae"        
## [262] "Stachyuraceae"      "Stegnospermataceae" "Stemonuraceae"     
## [265] "Stilbaceae"         "Strelitziaceae"     "Stylidiaceae"      
## [268] "Symplocaceae"       "Talinaceae"         "Tamaricaceae"      
## [271] "Tapisciaceae"       "Taxaceae"           "Tetracarpaeaceae"  
## [274] "Tetrachondraceae"   "Tetramelaceae"      "Tetrameristaceae"  
## [277] "Theaceae"           "Thesiaceae"         "Thomandersiaceae"  
## [280] "Thymelaeaceae"      "Ticodendraceae"     "Tofieldiaceae"     
## [283] "Tovariaceae"        "Triuridaceae"       "Trochodendraceae"  
## [286] "Tropaeolaceae"      "Typhaceae"          "Ulmaceae"          
## [289] "Urticaceae"         "Verbenaceae"        "Viscaceae"         
## [292] "Vochysiaceae"       "Wellstediaceae"     "Winteraceae"       
## [295] "Ximeniaceae"        "Xyridaceae"         "Zamiaceae"         
## [298] "Zingiberaceae"      "Zosteraceae"        "Zygophyllaceae"    
## 
## $`A:B:C`
##  [1] "Adoxaceae"       "Altingiaceae"    "Amaryllidaceae"  "Anacardiaceae"  
##  [5] "Annonaceae"      "Apocynaceae"     "Asteraceae"      "Balsaminaceae"  
##  [9] "Berberidaceae"   "Boraginaceae"    "Brassicaceae"    "Buxaceae"       
## [13] "Caprifoliaceae"  "Celastraceae"    "Cornaceae"       "Cupressaceae"   
## [17] "Elaeagnaceae"    "Ericaceae"       "Fabaceae"        "Fagaceae"       
## [21] "Gelsemiaceae"    "Ginkgoaceae"     "Hyacinthaceae"   "Lamiaceae"      
## [25] "Lardizabalaceae" "Magnoliaceae"    "Moraceae"        "Nyssaceae"      
## [29] "Oleaceae"        "Papaveraceae"    "Paulowniaceae"   "Pinaceae"       
## [33] "Plantaginaceae"  "Platanaceae"     "Poaceae"         "Polemoniaceae"  
## [37] "Ranunculaceae"   "Rosaceae"        "Salicaceae"      "Sapindaceae"    
## [41] "Staphyleaceae"   "Styracaceae"     "Violaceae"       "Vitaceae"
attr(venn_refs, "intersections")[["A:C"]] #Genus present in ITS2 db & detected, but not in rbcL db (ONE TAXON)
## [1] "Brassicaceae_Mummenhoffia"
attr(venn_refs, "intersections")[["B:C"]] #Genus present in rbcL db & detected, but not in ITS2 db (NONE)
## NULL
# the two datasets were capable of detecting virtually all the same taxa

# VennDiagram::venn.diagram( #making a *pretty* diagram (same info as three-way venn above)
#   list(ref_its2_split_names$FamGen,
#   ref_rbcL_split_names$FamGen,
#   ref_assigns$FamGen),
# #  main="Genera Shared Between Reference Databases",
# #  main.pos = c(0.5,0.95),
# #  main.fontfamily = "sans",
# #  main.fontface = "plain",
#   fontfamily = c(rep("sans",7)),
#   fontface = c(rep("plain",3), rep("bold",4)),
#   cex = c(rep(2,3), rep(3,4)),
#   cat.fontface = c("plain","plain","bold"),
#   cat.fontfamily=c("sans","sans", "sans"),
#   category.names = c("ITS2 reference \n database", "rbcL reference \n database", "detected \n in this project"),
#   fill = c("#F8766D", "#00BFC4", "white"),
#   alpha = c(0.5, 0.5, 0),
#   lwd = c(0, 0, 2),
#   cat.pos=c(25, 335,180), # these are ordered: rbcL, ITS, detected (because i flipped the plot 180 below)
#   cat.dist=c(0.08,0.08, -0.04),
#   rotation.degree=180,
#   "Figure_5_left.tiff") #saves to the current working directory

# venn diagram of Genus assignments for the ITS2 and rbcL dataSETS
venn_assigns<-gplots::venn(
  list(ITS2_assigns$Genus,
       rbcL_assigns$Genus), intersections = TRUE)

attr(venn_assigns, "intersections")[["A"]] #22 genera detected only in ITS2 samples
##  [1] "Mertensia"    "Taraxacum"    "Cardamine"    "Buglossoides" "Mummenhoffia"
##  [6] "Exochorda"    "Packera"      "Halesia"      "Geum"         "Thlaspi"     
## [11] "Crataegus"    "Erigeron"     "Leucojum"     "Chelidonium"  "Photinia"    
## [16] "Fontanesia"   "Celastrus"    "Malus"        "Fagus"        "Impatiens"   
## [21] "Gaylussacia"  "Anemone"
attr(venn_assigns, "intersections")[["A:B"]] #29 genera detected by both
##  [1] "Prunus"       "Quercus"      "Dicentra"     "Lamium"       "Polemonium"  
##  [6] "Viola"        "Potentilla"   "Medicago"     "Cercis"       "Acer"        
## [11] "Robinia"      "Platanus"     "Podophyllum"  "Viburnum"     "Ranunculus"  
## [16] "Syringa"      "Paulownia"    "Hesperis"     "Aesculus"     "Rosa"        
## [21] "Pulmonaria"   "Lamprocapnos" "Stylophorum"  "Aronia"       "Pyrus"       
## [26] "Vicia"        "Berberis"     "Vaccinium"    "Spiraea"
attr(venn_assigns, "intersections")[["B"]] #43 genera detected only in rbcL samples
##  [1] "Lonicera"      "Pinus"         "Salix"         "Liquidambar"  
##  [5] "Brassica"      "Elaeagnus"     "Glechoma"      "Anthoxanthum" 
##  [9] "Lupinus"       "Morus"         "Chamaecyparis" "Veronica"     
## [13] "Asimina"       "Vinca"         "Cunninghamia"  "Lolium"       
## [17] "Poa"           "Cynoglossum"   "Euscaphis"     "Akebia"       
## [21] "Magnolia"      "Trifolium"     "Ginkgo"        "Picea"        
## [25] "Barbarea"      "Fraxinus"      "Dactylis"      "Wisteria"     
## [29] "Virgilia"      "Liriodendron"  "Nyssa"         "Gleditsia"    
## [33] "Vitis"         "Securigera"    "Pachysandra"   "Gelsemium"    
## [37] "Scilla"        "Toxicodendron" "Ligustrum"     "Cornus"       
## [41] "Alliaria"      "Rubus"         "Isatis"        "Phleum"
# although the two databases contained the same genera detected in my study, certain detected genera only appeared in one dataset or the other


# venn diagram of Genus assignments for the ITS2 and rbcL dataSETS, and most common genera
venn_assigns_common<-gplots::venn(
  list(ITS2_assigns$Genus,
       rbcL_assigns$Genus,
       topGenera2$Genus), intersections = TRUE)

Genera_ITS2<-attr(venn_assigns_common, "intersections")[["A:C"]] #4 common genera detected only in ITS2 samples
Genera_Both<-attr(venn_assigns_common, "intersections")[["A:B:C"]] #9 common genera detected by both
Genera_rbcL<-attr(venn_assigns_common, "intersections")[["B:C"]] #7 common genera detected only in rbcL samples

# VennDiagram::venn.diagram( #making a *pretty* diagram (same info as three-way venn above)
#   list(ITS2_assigns$Genus,
#        rbcL_assigns$Genus,
#        topGenera2$Genus),
# #  main="Genera Shared Between Barcode Datasets",
# #  main.pos = c(0.5,0.95),
# #  main.fontfamily = "sans",
# #  main.fontface = "plain",
#   fontfamily = c(rep("sans",7)),
#   fontface = c(rep("plain",3), rep("bold",4)),
#   cex = c(rep(2,3), rep(3,4)),
#   cat.fontface = c("plain","plain","bold"),
#   cat.fontfamily=c("sans","sans","sans"),
#   category.names = c("genera detected in \n ITS2 samples", "genera detected in \n rbcL samples", "most common genera"),
#   fill = c("#F8766D", "#00BFC4", "white"),
#   alpha = c(0.5, 0.5, 0),
#   lwd = c(0, 0, 2),
#  # lty = c(1,1,1),
#  # col = c("black","black","white"),
#  # cat.cex=c(2.5,2.5,0),
#   cat.pos=c(335, 25, 180),
#   cat.dist=c(0.08, 0.08, -0.2),
#   "Figure_5_right.tiff") #saves to the current working directory

# grid::grid.raster(tiff::readTIFF("file_name.tiff")) # view the plot



# use some lists and functions to return a table with counts of intersections
list<-list(
    most_common=topGenera2$Genus, 
    its2_detect=ITS2_assigns$Genus,
    rbcL_detect=rbcL_assigns$Genus
    )
combs <- unlist(lapply(1:length(list), 
                function(j) combn(names(list), j, simplify = FALSE)),
         recursive = FALSE)
names(combs) <- sapply(combs, function(i) paste0(i, collapse = ""))
str(combs)
## List of 7
##  $ most_common                      : chr "most_common"
##  $ its2_detect                      : chr "its2_detect"
##  $ rbcL_detect                      : chr "rbcL_detect"
##  $ most_commonits2_detect           : chr [1:2] "most_common" "its2_detect"
##  $ most_commonrbcL_detect           : chr [1:2] "most_common" "rbcL_detect"
##  $ its2_detectrbcL_detect           : chr [1:2] "its2_detect" "rbcL_detect"
##  $ most_commonits2_detectrbcL_detect: chr [1:3] "most_common" "its2_detect" "rbcL_detect"
Intersect <- function (x) {  
  # Multiple set version of intersect
  # x is a list
  if (length(x) == 1) {
    unlist(x)
  } else if (length(x) == 2) {
    intersect(x[[1]], x[[2]])
  } else if (length(x) > 2){
    intersect(x[[1]], Intersect(x[-1]))
  }
}
Union <- function (x) {  
  # Multiple set version of union
  # x is a list
  if (length(x) == 1) {
    unlist(x)
  } else if (length(x) == 2) {
    union(x[[1]], x[[2]])
  } else if (length(x) > 2) {
    union(x[[1]], Union(x[-1]))
  }
}
Setdiff <- function (x, y) {
  # Remove the union of the y's from the common x's. 
  # x and y are lists of characters.
  xx <- Intersect(x)
  yy <- Union(y)
  setdiff(xx, yy)
}
elements <- 
  lapply(combs, function(i) Setdiff(list[i], list[setdiff(names(list), i)]))
n.elements <- sapply(elements, length)
print(n.elements)
##                       most_common                       its2_detect 
##                                 0                                18 
##                       rbcL_detect            most_commonits2_detect 
##                                37                                 4 
##            most_commonrbcL_detect            its2_detectrbcL_detect 
##                                 7                                19 
## most_commonits2_detectrbcL_detect 
##                                10
#from lines list<-list() ...to... print(n.elements): this code was largely copied from https://stackoverflow.com/questions/23559371/get-the-list-of-items-in-venn-diagram

rm(list, combs, Intersect, Union, Setdiff, elements, n.elements)

rm(rbcL_assigns, ITS2_assigns, ref_assigns)

Proportion of reads assigned to genus by barcode

#plot prop reads assigned to the top 21 genera by barcode


ggplot()+
  geom_col(data=PollenIDs %>% mutate(FamGen=paste(Family, Genus, sep=" ")),
           aes(x=Barcode, y=PropReads,
           fill = ifelse(.data$Genus %in% topGenera2$Genus, FamGen, 
                        ifelse(!is.na(.data$Genus), "z_AllOther", NA))), 
           position="fill")+
  geom_label(data=PollenIDs %>% group_by(Barcode) %>% summarize(nbees=n_distinct(BeeID)),
             aes(x=Barcode, y=-0.05, label=paste("n = ", nbees, sep="")))+
  scale_fill_manual(name="FamGen", values = myColors.FamGen, na.value = "gray")+
  guides(fill=guide_legend(title = ""))+
  labs(y="Proportion of Reads") +
  theme_bw(base_size = 16)+
  theme(legend.position = "bottom",
        legend.text = element_text(size=10),
        legend.box.just = "left")

# prop reads assigned to top genera UNIQUE to each barcode 
PollenIDs %>%
  mutate(FamGen=paste(Family, Genus, sep=" ")) %>%
ggplot(aes(x=Barcode,
           y=PropReads,
           fill = ifelse(.data$Genus %in% Genera_Both, "z_Shared", 
                         ifelse(.data$Genus %in% topGenera2$Genus, FamGen, 
                                ifelse(!is.na(.data$Genus), "z_AllOther", NA)))))+
  geom_col(position="fill")+
  scale_fill_manual(name="FamGen", values = myColors.FamGen, na.value = "gray")+ 
  guides(fill=guide_legend(title = "Family Genus"))+
  theme_bw()

#to plot side by side the prop reads assigned to Genus for the two barcodes
#creating a vector of sample names that have both rbcL and ITS2 data
both<-as.vector((PollenIDs %>% group_by(BeeID, Barcode) %>% summarise(BeeID=first(BeeID), Barcode=first(Barcode)) %>% group_by(BeeID) %>% summarise(Barcodes=n()) %>% 
filter(Barcodes==2))$BeeID) #64 pollen samples with rbcL and ITS2 data
## `summarise()` has grouped output by 'BeeID'. You can override using the
## `.groups` argument.
dim(CapLog %>% select(BeeID, CarryPollen) %>% filter(CarryPollen==1)) # 130 queens carrying pollen
## [1] 130   2
dim(CapLog %>% select(BeeID, SampleID, CarryPollen) %>% filter(CarryPollen==1&SampleID!=0)) # 111 pollen samples collected
## [1] 111   3
dim(PollenIDs %>% select(BeeID) %>% distinct()) # 107 samples with pollen data
## [1] 107   1
#the proportion of reads assigned to genera, split up by each individual pollen sample/bee; the same colors means the two barcodes agree with pollen genera composition
PollenIDs %>%
 filter(BeeID %in% both) %>%
ggplot(aes(x=Barcode,y=PropReads,fill=Genus))+
  geom_col(position="fill")+
  facet_wrap(~BeeID)+
  theme(legend.position = "none")

# do the barcode-specific genera appear in samples with only one or the other barcode?

#among samples in which genera unique to ITS2 were detected, how many samples had data from one or both barcodes?
samples_with_ITS2Only_genera<-PollenIDs %>% filter(.data$Genus %in% Genera_ITS2) %>% distinct(BeeID)
PollenIDs %>% filter(.data$BeeID %in% samples_with_ITS2Only_genera$BeeID) %>% select(BeeID,Barcode) %>% group_by(BeeID) %>% summarize(NBarcodes=n_distinct(Barcode)) %>% group_by(NBarcodes) %>% summarize(NSamples=n_distinct(BeeID))
## # A tibble: 2 × 2
##   NBarcodes NSamples
##       <int>    <int>
## 1         1        6
## 2         2       40
samples_with_rbcLOnly_genera<-PollenIDs %>% filter(.data$Genus %in% Genera_rbcL) %>% distinct(BeeID)
PollenIDs %>% filter(.data$BeeID %in% samples_with_rbcLOnly_genera$BeeID) %>% select(BeeID,Barcode) %>% group_by(BeeID) %>% summarize(NBarcodes=n_distinct(Barcode)) %>% group_by(NBarcodes) %>% summarize(NSamples=n_distinct(BeeID))
## # A tibble: 2 × 2
##   NBarcodes NSamples
##       <int>    <int>
## 1         1       31
## 2         2       58
rm(samples_with_ITS2Only_genera, samples_with_rbcLOnly_genera)

Barcode gap?

# are barcode-specific genera due to a "diff-genera, same-family" type situation?

# taxonomic names in reference databases, for genera unique to each barcode
rbind(ref_its2_split_names, ref_rbcL_split_names)
## # A tibble: 18,159 × 5
## # Groups:   Barcode, Family, FamGen [18,159]
##    Barcode Family      FamGen                    Genus             n
##    <chr>   <chr>       <chr>                     <chr>         <int>
##  1 ITS2    Acanthaceae Acanthaceae_Acanthopale   Acanthopale       2
##  2 ITS2    Acanthaceae Acanthaceae_Acanthopsis   Acanthopsis       2
##  3 ITS2    Acanthaceae Acanthaceae_Acanthostelma Acanthostelma     1
##  4 ITS2    Acanthaceae Acanthaceae_Acanthus      Acanthus          5
##  5 ITS2    Acanthaceae Acanthaceae_Achyrocalyx   Achyrocalyx       1
##  6 ITS2    Acanthaceae Acanthaceae_Aechmanthera  Aechmanthera      1
##  7 ITS2    Acanthaceae Acanthaceae_Ancistranthus Ancistranthus     1
##  8 ITS2    Acanthaceae Acanthaceae_Andrographis  Andrographis      1
##  9 ITS2    Acanthaceae Acanthaceae_Angkalanthus  Angkalanthus      1
## 10 ITS2    Acanthaceae Acanthaceae_Anisacanthus  Anisacanthus     11
## # ℹ 18,149 more rows
rbind(ref_its2_split_names, ref_rbcL_split_names) %>% filter(.data$Genus %in% Genera_ITS2)
## # A tibble: 7 × 5
## # Groups:   Barcode, Family, FamGen [7]
##   Barcode Family       FamGen                    Genus            n
##   <chr>   <chr>        <chr>                     <chr>        <int>
## 1 ITS2    Asteraceae   Asteraceae_Taraxacum      Taraxacum       65
## 2 ITS2    Boraginaceae Boraginaceae_Mertensia    Mertensia        5
## 3 ITS2    Brassicaceae Brassicaceae_Mummenhoffia Mummenhoffia     1
## 4 ITS2    Styracaceae  Styracaceae_Halesia       Halesia          2
## 5 rbcL    Asteraceae   Asteraceae_Taraxacum      Taraxacum        9
## 6 rbcL    Boraginaceae Boraginaceae_Mertensia    Mertensia        1
## 7 rbcL    Styracaceae  Styracaceae_Halesia       Halesia          2
rbind(ref_its2_split_names, ref_rbcL_split_names) %>% filter(.data$Genus %in% Genera_rbcL)
## # A tibble: 14 × 5
## # Groups:   Barcode, Family, FamGen [14]
##    Barcode Family         FamGen                    Genus            n
##    <chr>   <chr>          <chr>                     <chr>        <int>
##  1 ITS2    Brassicaceae   Brassicaceae_Barbarea     Barbarea         3
##  2 ITS2    Caprifoliaceae Caprifoliaceae_Lonicera   Lonicera        52
##  3 ITS2    Elaeagnaceae   Elaeagnaceae_Elaeagnus    Elaeagnus        8
##  4 ITS2    Lamiaceae      Lamiaceae_Glechoma        Glechoma         5
##  5 ITS2    Magnoliaceae   Magnoliaceae_Liriodendron Liriodendron     1
##  6 ITS2    Pinaceae       Pinaceae_Pinus            Pinus           40
##  7 ITS2    Salicaceae     Salicaceae_Salix          Salix           20
##  8 rbcL    Brassicaceae   Brassicaceae_Barbarea     Barbarea         3
##  9 rbcL    Caprifoliaceae Caprifoliaceae_Lonicera   Lonicera        25
## 10 rbcL    Elaeagnaceae   Elaeagnaceae_Elaeagnus    Elaeagnus       10
## 11 rbcL    Lamiaceae      Lamiaceae_Glechoma        Glechoma         3
## 12 rbcL    Magnoliaceae   Magnoliaceae_Liriodendron Liriodendron     2
## 13 rbcL    Pinaceae       Pinaceae_Pinus            Pinus           91
## 14 rbcL    Salicaceae     Salicaceae_Salix          Salix           59
# what are the family names of the common genera detected by only one barcode or the other
PollenIDs %>% select(Barcode, Family, Genus) %>% arrange(Barcode, Family) %>% distinct() %>%
  filter(if_else(.data$Barcode=="ITS2", 
                 .data$Family %in% (rbind(ref_its2_split_names, ref_rbcL_split_names) %>% filter(.data$Genus %in% Genera_rbcL))$Family, 
                 .data$Family %in% (rbind(ref_its2_split_names, ref_rbcL_split_names) %>% filter(.data$Genus %in% Genera_ITS2))$Family))
##    Barcode       Family        Genus
## 1     ITS2 Brassicaceae    Cardamine
## 2     ITS2 Brassicaceae Mummenhoffia
## 3     ITS2 Brassicaceae      Thlaspi
## 4     ITS2 Brassicaceae     Hesperis
## 5     ITS2    Lamiaceae       Lamium
## 6     rbcL Boraginaceae  Cynoglossum
## 7     rbcL Boraginaceae   Pulmonaria
## 8     rbcL Brassicaceae     Brassica
## 9     rbcL Brassicaceae     Hesperis
## 10    rbcL Brassicaceae     Barbarea
## 11    rbcL Brassicaceae     Alliaria
## 12    rbcL Brassicaceae       Isatis
# which samples names have mertensia (a its2 unique genus) in them
PollenIDs %>% filter(Genus=="Brassica") %>% filter(.data$BeeID %in% TwoBarcodeBees$BeeID) %>% select(BeeID)
##      BeeID
## 1    Bb004
## 2    Bb005
## 3    Bb011
## 4    Bb018
## 5    Bf001
## 6    Bg002
## 7    Bg003
## 8    Bg004
## 9    Bg009
## 10   Bg010
## 11   Bg011
## 12 KLS0163
## 13 KLS0201
## 14 KLS0205
temp1<-PollenIDs %>% 
       filter(.data$BeeID %in% (PollenIDs %>% 
                                  filter(.data$Genus %in% Genera_rbcL | .data$Genus %in% Genera_ITS2) %>% 
                                  select(BeeID) %>% 
                                  filter(.data$BeeID %in% TwoBarcodeBees$BeeID)
                                )$BeeID) %>% 
       filter(.data$Family %in% (rbind(ref_its2_split_names, ref_rbcL_split_names) %>% filter(.data$Genus %in% Genera_rbcL))$Family |
              .data$Family %in% (rbind(ref_its2_split_names, ref_rbcL_split_names) %>% filter(.data$Genus %in% Genera_ITS2))$Family) %>%
       group_by(BeeID, Barcode, Family) %>% 
       summarise(BeeID=first(BeeID), Barcode=first(Barcode), Family=first(Family), Genus=first(Genus), SampleTotalReads=first(SampleTotalReads), Reads=sum(Reads), PropReads=sum(PropReads))
## `summarise()` has grouped output by 'BeeID', 'Barcode'. You can override using
## the `.groups` argument.
# in samples where a common-but-barcode-specific genus occurs, show me the total reads assigned to any genera of the same family
temp1 %>%
  ggplot(aes(y=PropReads, col=Barcode))+
  geom_boxplot()+
  facet_wrap(~Family)

temp2<-PollenIDs %>% 
       filter(.data$BeeID %in% (PollenIDs %>% 
                                  filter(.data$Genus %in% Genera_rbcL | .data$Genus %in% Genera_ITS2) %>% 
                                  select(BeeID) %>% 
                                  filter(.data$BeeID %in% TwoBarcodeBees$BeeID)
                                )$BeeID) %>% 
       filter(.data$Family %in% (rbind(ref_its2_split_names, ref_rbcL_split_names) %>% filter(.data$Genus %in% Genera_rbcL))$Family |
              .data$Family %in% (rbind(ref_its2_split_names, ref_rbcL_split_names) %>% filter(.data$Genus %in% Genera_ITS2))$Family) %>%
       group_by(BeeID, Barcode, Family) %>% 
       summarise(BeeID=first(BeeID), Barcode=first(Barcode), Family=first(Family), Genus=first(Genus), SampleTotalReads=first(SampleTotalReads), Reads=sum(Reads), PropReads=sum(PropReads)) %>%
       group_by(BeeID, Barcode, Family) %>%
       summarise(sumPropReads=sum(PropReads)) %>%
       group_by(BeeID, Family) %>%
       reframe(BarcodeDiff=sumPropReads[Barcode=="ITS2"]-sumPropReads[Barcode=="rbcL"])
## `summarise()` has grouped output by 'BeeID', 'Barcode'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'BeeID', 'Barcode'. You can override using
## the `.groups` argument.
temp2 %>%
  ggplot(aes(y=BarcodeDiff))+
  geom_boxplot()+
  facet_wrap(~Family)

Genera richness and Shannon diversity of barcode-samples

Richness

dat<-PollenIDs %>%
  group_by(BeeID,Barcode)%>%
  summarise(GeneraRichness=n_distinct(Genus))
## `summarise()` has grouped output by 'BeeID'. You can override using the
## `.groups` argument.
summary(stats::aov(GeneraRichness ~ Barcode, data = dat)) # sig difference in richness detected with rbcL versus ITS2
##              Df Sum Sq Mean Sq F value Pr(>F)    
## Barcode       1   1297  1297.4   113.9 <2e-16 ***
## Residuals   169   1926    11.4                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot_df<-dat %>% group_by(Barcode) %>%
  summarize(mean = round(mean(GeneraRichness), 2),
            err = stats::sd(GeneraRichness)/sqrt(length(GeneraRichness)),
            n = n()) %>% 
  dplyr::mutate(label = "n") %>% 
  unite("n_label", label, n, sep = " = ", remove = FALSE)

ggplot(plot_df, aes(x = Barcode, y = mean, fill = Barcode)) +
  geom_col(color = "black") +
  geom_errorbar(aes(ymin = mean - err, ymax = mean + err), width = 0.4) +
  geom_label(aes(x = Barcode, y = 0.5, label = n_label), fill="white") +
  theme(legend.position = "none") +
  labs(x = "Barcode",
       y = "Mean Genera Richness",
       title = "Genera Richness")+
  theme_bw(base_size = 18)+
  theme(axis.title.x=element_text("none"),
        legend.position = "none")
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): no font could
## be found for family "none"
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): no font could
## be found for family "none"
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): no font could
## be found for family "none"
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): no font could
## be found for family "none"
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): no font could
## be found for family "none"
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): no font could
## be found for family "none"
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): no font could
## be found for family "none"
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): no font could
## be found for family "none"
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): no font could
## be found for family "none"
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): no font could
## be found for family "none"
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): no font could
## be found for family "none"
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): no font could
## be found for family "none"
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): no font could
## be found for family "none"
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): no font could
## be found for family "none"
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : no
## font could be found for family "none"
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : no
## font could be found for family "none"
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, : no
## font could be found for family "none"
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, : no
## font could be found for family "none"
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, : no
## font could be found for family "none"

Shannon diversity

#this dataset is dat.Queens plus dat.Sites AND it also keeps data from the two barcodes separate
dat<- left_join(
  PollenIDgenera.sum %>% pivot_wider(., names_from = "Genus", values_from = "PropReads"), 
  dat.Queen %>% select(Date:NestSeek), by="BeeID") %>% 
  left_join(., dat.Sites, by=c("Date", "SiteID")) %>% 
  mutate(across(Prunus:Phleum, ~replace(., is.na(.), 0))) %>%
  select(Barcode, BeeID, Date, Bombus_spp, SiteID, Updated_x, Updated_y, HostGenus, CapTimeH, CapTimeM, NLCD1000_Water:NLCD1000_Crop, Prunus:Phleum)

dat<-as.data.frame(dat)
colnames(dat)
##   [1] "Barcode"             "BeeID"               "Date"               
##   [4] "Bombus_spp"          "SiteID"              "Updated_x"          
##   [7] "Updated_y"           "HostGenus"           "CapTimeH"           
##  [10] "CapTimeM"            "NLCD1000_Water"      "NLCD1000_Developed" 
##  [13] "NLCD1000_Forest"     "NLCD1000_Herbaceous" "NLCD1000_Crop"      
##  [16] "Prunus"              "Dicentra"            "Mertensia"          
##  [19] "Quercus"             "Lamium"              "Polemonium"         
##  [22] "Viola"               "NA"                  "Cardamine"          
##  [25] "Potentilla"          "Taraxacum"           "Buglossoides"       
##  [28] "Medicago"            "Cercis"              "Acer"               
##  [31] "Exochorda"           "Mummenhoffia"        "Halesia"            
##  [34] "Packera"             "Robinia"             "Platanus"           
##  [37] "Geum"                "Podophyllum"         "Thlaspi"            
##  [40] "Ranunculus"          "Viburnum"            "Syringa"            
##  [43] "Crataegus"           "Erigeron"            "Chelidonium"        
##  [46] "Leucojum"            "Photinia"            "Paulownia"          
##  [49] "Fontanesia"          "Hesperis"            "Aesculus"           
##  [52] "Rosa"                "Celastrus"           "Lamprocapnos"       
##  [55] "Pulmonaria"          "Aronia"              "Stylophorum"        
##  [58] "Pyrus"               "Malus"               "Fagus"              
##  [61] "Vicia"               "Berberis"            "Vaccinium"          
##  [64] "Impatiens"           "Gaylussacia"         "Anemone"            
##  [67] "Spiraea"             "Lonicera"            "Pinus"              
##  [70] "Salix"               "Anthoxanthum"        "Brassica"           
##  [73] "Elaeagnus"           "Glechoma"            "Liquidambar"        
##  [76] "Lupinus"             "Morus"               "Chamaecyparis"      
##  [79] "Veronica"            "Asimina"             "Cunninghamia"       
##  [82] "Lolium"              "Poa"                 "Vinca"              
##  [85] "Cynoglossum"         "Euscaphis"           "Akebia"             
##  [88] "Magnolia"            "Ginkgo"              "Picea"              
##  [91] "Trifolium"           "Barbarea"            "Dactylis"           
##  [94] "Fraxinus"            "Wisteria"            "Virgilia"           
##  [97] "Liriodendron"        "Nyssa"               "Gleditsia"          
## [100] "Vitis"               "Securigera"          "Gelsemium"          
## [103] "Pachysandra"         "Scilla"              "Toxicodendron"      
## [106] "Ligustrum"           "Cornus"              "Alliaria"           
## [109] "Rubus"               "Isatis"              "Phleum"
dat2<-as.matrix(dat %>% select(-c(Barcode:NLCD1000_Crop, 'NA')) %>% select(-any_of(singletonGenera)))
rownames(dat2)<-dat$BeeID
sort(colnames(dat2))
##  [1] "Acer"          "Aesculus"      "Anthoxanthum"  "Aronia"       
##  [5] "Asimina"       "Barbarea"      "Berberis"      "Brassica"     
##  [9] "Buglossoides"  "Cardamine"     "Celastrus"     "Cercis"       
## [13] "Chamaecyparis" "Cornus"        "Crataegus"     "Cunninghamia" 
## [17] "Dicentra"      "Elaeagnus"     "Erigeron"      "Exochorda"    
## [21] "Fagus"         "Fontanesia"    "Fraxinus"      "Gaylussacia"  
## [25] "Geum"          "Ginkgo"        "Glechoma"      "Gleditsia"    
## [29] "Halesia"       "Hesperis"      "Lamium"        "Liquidambar"  
## [33] "Liriodendron"  "Lolium"        "Lonicera"      "Lupinus"      
## [37] "Magnolia"      "Mertensia"     "Morus"         "Mummenhoffia" 
## [41] "Packera"       "Paulownia"     "Picea"         "Pinus"        
## [45] "Platanus"      "Poa"           "Podophyllum"   "Polemonium"   
## [49] "Potentilla"    "Prunus"        "Pyrus"         "Quercus"      
## [53] "Ranunculus"    "Robinia"       "Rosa"          "Salix"        
## [57] "Stylophorum"   "Syringa"       "Taraxacum"     "Thlaspi"      
## [61] "Trifolium"     "Vaccinium"     "Veronica"      "Viburnum"     
## [65] "Vicia"         "Vinca"         "Viola"         "Wisteria"
dat$ShannonD<-vegan::diversity(dat2, index="shannon")
summary(stats::aov(ShannonD ~ Barcode, data = dat)) # sig difference in diversity detected with rbcL versus ITS2
##              Df Sum Sq Mean Sq F value Pr(>F)   
## Barcode       1   2.24  2.2414   8.207 0.0047 **
## Residuals   169  46.15  0.2731                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot_df <- dat  %>% 
  group_by(Barcode) %>% 
  # calculate mean and standard error of diversity
  summarize(mean = round(mean(ShannonD), 2),
            err = stats::sd(ShannonD)/sqrt(length(ShannonD)),
            n = n()) %>% 
  dplyr::mutate(label = "n") %>% 
  unite("n_label", label, n, sep = " = ", remove = FALSE)

ggplot(plot_df, aes(x = Barcode, y = mean, fill = Barcode)) +
  geom_col(color = "black") +
#  scale_fill_manual(values = colorBlindness::SteppedSequential5Steps) +
  geom_errorbar(aes(ymin = mean - err, ymax = mean + err), width = 0.5) +
  geom_label(aes(x = Barcode, y = 0.05, label = n_label), fill="white") +
#  scale_y_continuous(limits = c(0, 2.75), expand = c(0,0)) +
  labs(x = "Barcode",
       y = "Mean Shannon diversity",
       title = "Shannon diversity")+
  theme_bw(base_size = 18)+
  theme(legend.position = "none")

rm(plot_df)

NMDS of barcode-samples

# visualizing how barcodes separate in multidimensional space

#dplyr::filters/selects that are useful
#  filter(!str_starts(SiteID, "BEF")) %>%  #remove Blandy
#   select(Cercis:Phleum) %>% #select metabarcoding data
#   filter(!is.na(Cercis)) %>% #remove rows without metabarcoding data
#   select(-any_of(singletonGenera)) # BUT de-select the singletons
# mutations that are helpful
#   mutate(across(everything(), ~as.numeric(.))))

#this dataset is dat.Queens plus dat.Sites AND it also keeps data from the two barcodes separate
dat<- left_join(
  PollenIDgenera.sum %>% pivot_wider(., names_from = "Genus", values_from = "PropReads"), 
  dat.Queen %>% select(Date:NestSeek), by="BeeID") %>% 
  left_join(., dat.Sites, by=c("Date", "SiteID")) %>% 
  mutate(across(Prunus:Phleum, ~replace(., is.na(.), 0))) %>%
  select(Barcode, BeeID, Date, Bombus_spp, SiteID, Name, Updated_x, Updated_y, HostGenus, CapTimeH, CapTimeM, NLCD1000_Water:NLCD1000_Crop, Prunus:Phleum) 

dat2<-as.data.frame(dat %>% 
    ungroup() %>%
    select(Prunus:Phleum) %>% 
    select(-c(any_of(singletonGenera), "NA")))

colnames(dat2)
##  [1] "Prunus"        "Dicentra"      "Mertensia"     "Quercus"      
##  [5] "Lamium"        "Polemonium"    "Viola"         "Cardamine"    
##  [9] "Potentilla"    "Taraxacum"     "Buglossoides"  "Cercis"       
## [13] "Acer"          "Exochorda"     "Mummenhoffia"  "Halesia"      
## [17] "Packera"       "Robinia"       "Platanus"      "Geum"         
## [21] "Podophyllum"   "Thlaspi"       "Ranunculus"    "Viburnum"     
## [25] "Syringa"       "Crataegus"     "Erigeron"      "Paulownia"    
## [29] "Fontanesia"    "Hesperis"      "Aesculus"      "Rosa"         
## [33] "Celastrus"     "Aronia"        "Stylophorum"   "Pyrus"        
## [37] "Fagus"         "Vicia"         "Berberis"      "Vaccinium"    
## [41] "Gaylussacia"   "Lonicera"      "Pinus"         "Salix"        
## [45] "Anthoxanthum"  "Brassica"      "Elaeagnus"     "Glechoma"     
## [49] "Liquidambar"   "Lupinus"       "Morus"         "Chamaecyparis"
## [53] "Veronica"      "Asimina"       "Cunninghamia"  "Lolium"       
## [57] "Poa"           "Vinca"         "Magnolia"      "Ginkgo"       
## [61] "Picea"         "Trifolium"     "Barbarea"      "Fraxinus"     
## [65] "Wisteria"      "Liriodendron"  "Gleditsia"     "Cornus"
dat2mat<-as.matrix(dat2)

library(vegan)
## Loading required package: permute
## Loading required package: lattice
NMDS_barcode = metaMDS(dat2mat, k = 2, trymax = 20) #all data
## Run 0 stress 0.1764864 
## Run 1 stress 0.1815978 
## Run 2 stress 0.1762644 
## ... New best solution
## ... Procrustes: rmse 0.0347055  max resid 0.3131506 
## Run 3 stress 0.1780208 
## Run 4 stress 0.1795438 
## Run 5 stress 0.1774141 
## Run 6 stress 0.1769081 
## Run 7 stress 0.1794215 
## Run 8 stress 0.1802235 
## Run 9 stress 0.1797725 
## Run 10 stress 0.177031 
## Run 11 stress 0.1783333 
## Run 12 stress 0.1772149 
## Run 13 stress 0.1771803 
## Run 14 stress 0.1766901 
## ... Procrustes: rmse 0.04831964  max resid 0.4201105 
## Run 15 stress 0.17843 
## Run 16 stress 0.1804889 
## Run 17 stress 0.1800755 
## Run 18 stress 0.1759254 
## ... New best solution
## ... Procrustes: rmse 0.03248528  max resid 0.2699901 
## Run 19 stress 0.176513 
## Run 20 stress 0.1788741 
## *** Best solution was not repeated -- monoMDS stopping criteria:
##     12: no. of iterations >= maxit
##      8: stress ratio > sratmax
NMDS_barcode
## 
## Call:
## metaMDS(comm = dat2mat, k = 2, trymax = 20) 
## 
## global Multidimensional Scaling using monoMDS
## 
## Data:     dat2mat 
## Distance: bray 
## 
## Dimensions: 2 
## Stress:     0.1759254 
## Stress type 1, weak ties
## Best solution was not repeated after 20 tries
## The best solution was from try 18 (random start)
## Scaling: centring, PC rotation, halfchange scaling 
## Species: expanded scores based on 'dat2mat'
stressplot(NMDS_barcode)

#goodness(NMDS_barcode)
#ordiplot(NMDS_barcode, type="text", cex=1.2)

NMDS_plot_df<- cbind(dat %>% ungroup(), 
                 vegan::scores(NMDS_barcode, display = "sites") %>% 
  as.data.frame())

ggplot(NMDS_plot_df, aes(x=NMDS1, y=NMDS2, color=Barcode, shape = Barcode))+
  geom_point(size = 4, alpha=0.5) +
# scale_color_manual(values = colorBlindness::SteppedSequential5Steps)+
  stat_ellipse(linetype = 2, linewidth = 1) +
  scale_x_continuous(limits=c(-4.5,1.5), breaks = c(-4.5,-3.5,-1,-0.5,0,0.5,1,1.5,2))+
  ggbreak::scale_x_break(breaks=c(-3.5,-1), scale=4)+
#  labs(title = "All Data")+
  theme_bw(base_size = 18)

#Try removing samples appearing as outliers in the intital NMDS

# which(NMDS_plot_df$NMDS1>1)
# which(NMDS_plot_df$NMDS1<(-3))
# View(dat[c(21,30,34,46),])
# 
# NMDS_barcode2 = metaMDS(dat2mat[-c(21,30,34,46),], k = 2, trymax = 20) #removing outliers from previous
# stressplot(NMDS_barcode2)
# NMDS_barcode2
# 
# NMDS_plot_df2<- cbind(dat[-c(21,30,34,46),] %>% ungroup(),
#                       vegan::scores(NMDS_barcode2, display = "sites") %>%
#    as.data.frame())
# 
# ggplot(NMDS_plot_df2, aes(x=NMDS1, y=NMDS2, color=Barcode, shape=Barcode))+
#   geom_point(size = 4, alpha=0.5) +
# # scale_color_manual(values = colorBlindness::SteppedSequential5Steps)+
#   stat_ellipse(linetype = 2, linewidth = 1) +
#   labs(title = "Excluding Outliers")



# Try removing samples without data from both barcodes

# dat<- left_join(
#   PollenIDgenera.sum %>% pivot_wider(., names_from = "Genus", values_from = "PropReads"),
#   dat.Queen %>% select(Date:NestSeek), by="BeeID") %>%
#   left_join(., dat.Sites, by=c("Date", "SiteID")) %>%
#   mutate(across(Prunus:Phleum, ~replace(., is.na(.), 0))) %>%
#   select(Barcode, BeeID, Date, Bombus_spp, SiteID, Name, Updated_x, Updated_y, HostName, HostGenus, CapTimeH, CapTimeM, NLCD1000_Water:NLCD1000_Crop, Prunus:Phleum) %>% filter(BeeID %in% both) # ADDITIONAL FILTER: restrict NMDS to just samples with data from both barcodes)
# dat2<-as.data.frame(dat %>%
#     ungroup() %>%
#     select(Prunus:Phleum) %>%
#     select(-c(any_of(singletonGenera), "NA")))
# colnames(dat2)
# dat2mat<-as.matrix(dat2)
# 
# NMDS_barcode4 = metaMDS(dat2mat, k = 2, trymax = 20) #all data
# NMDS_barcode4
# stressplot(NMDS_barcode4)
# goodness(NMDS_barcode4)
# ordiplot(NMDS_barcode4, type="text", cex=1.2)
# 
# NMDS_plot_df<- cbind(dat %>% ungroup(),
#                  vegan::scores(NMDS_barcode4, display = "sites") %>%
#   as.data.frame())
# 
# ggplot(NMDS_plot_df, aes(x=NMDS1, y=NMDS2, color=Barcode))+
#   geom_point(size = 4, alpha=0.5) +
# # scale_color_manual(values = colorBlindness::SteppedSequential5Steps)+
#   stat_ellipse(linetype = 2, linewidth = 1) +
#   labs(title = "Only Samples w/ Both Barcodes")


detach('package:vegan')

So the databases contain the same genera detected in the samples. But the average richness is higher in rbcL, as well as shannon diversity. And all three NMDS show rbcL clustering separately than ITS2.

Does pollen composition vary among Bombus species? (YES Blandy, NO Rare bees)

Chi-squared, Fisher’s exact test for proportion of reads assigned to most common pollen genera

# construct contingency table
dat<-PollenIDs %>% left_join(., dat.Queen %>% select(BeeID, Bombus_spp, Date), by="BeeID") %>%
  filter(Bombus_spp=="bimaculatus"|Bombus_spp=="griseocollis"|Bombus_spp=="impatiens") %>%
  filter(.data$Genus %in% topGenera2$Genus) %>%
  group_by(Bombus_spp, BeeID, Genus) %>% #first summarize across barcode within samples, genus
  summarize(PropReads=sum(PropReads)/first(ifelse(.data$BeeID %in% TwoBarcodeBees$BeeID, 2,1))) %>% 
#  filter(.data$BeeID %in% TwoBarcodeBees$BeeID) %>% #optional filter for BOTH BARCODE BEES
  group_by(Bombus_spp, Genus) %>% #next summarize across samples within Bombus spp, genus
  summarise(PropReads=sum(PropReads)) %>%
  pivot_wider(names_from = "Genus", values_from = "PropReads", values_fill = 0) %>%
  as.data.frame(.)
## `summarise()` has grouped output by 'Bombus_spp', 'BeeID'. You can override
## using the `.groups` argument.
## `summarise()` has grouped output by 'Bombus_spp'. You can override using the
## `.groups` argument.
rownames(dat)<-dat$Bombus_spp
dat<-dat %>% select(-Bombus_spp) %>% drop_na()
stats::chisq.test(dat)$expected # Expected counts < 5 ==> chi square approx. maybe doubtful
## Warning in stats::chisq.test(dat): Chi-squared approximation may be incorrect
##                   Acer  Aesculus  Barbarea   Cercis Elaeagnus  Glechoma
## bimaculatus  0.7844243 1.5517698 0.7243491 6.745888 0.9631372 1.7911066
## griseocollis 0.2790733 0.5520704 0.2577004 2.399973 0.3426537 0.6372189
## impatiens    0.1577744 0.3121137 0.1456912 1.356828 0.1937197 0.3602525
##                Halesia   Lamium Liriodendron  Lonicera Mertensia Mummenhoffia
## bimaculatus  2.3516963 5.375135    1.1890315 2.2500301 2.1920186    0.5701779
## griseocollis 0.8366589 1.912302    0.4230197 0.8004893 0.7798507    0.2028512
## impatiens    0.4730061 1.081122    0.2391547 0.4525576 0.4408895    0.1146822
##                  Pinus    Prunus   Quercus     Salix Taraxacum Vaccinium
## bimaculatus  3.1528013 4.7091124 0.8307746 4.7794906 0.7422049 0.8301126
## griseocollis 1.1216666 1.6753527 0.2955632 1.7003911 0.2640530 0.2953277
## impatiens    0.6341356 0.9471627 0.1670970 0.9613182 0.1492827 0.1669639
##               Viburnum     Viola  Paulownia
## bimaculatus  0.7895577 1.7715797 0.44578624
## griseocollis 0.2808996 0.6302718 0.15859660
## impatiens    0.1588069 0.3563250 0.08966278
(stats::chisq.test(dat)$expected)<5 # if > 20% of expected cell counts are less than 5, then use Fisher's exact test
## Warning in stats::chisq.test(dat): Chi-squared approximation may be incorrect
##              Acer Aesculus Barbarea Cercis Elaeagnus Glechoma Halesia Lamium
## bimaculatus  TRUE     TRUE     TRUE  FALSE      TRUE     TRUE    TRUE  FALSE
## griseocollis TRUE     TRUE     TRUE   TRUE      TRUE     TRUE    TRUE   TRUE
## impatiens    TRUE     TRUE     TRUE   TRUE      TRUE     TRUE    TRUE   TRUE
##              Liriodendron Lonicera Mertensia Mummenhoffia Pinus Prunus Quercus
## bimaculatus          TRUE     TRUE      TRUE         TRUE  TRUE   TRUE    TRUE
## griseocollis         TRUE     TRUE      TRUE         TRUE  TRUE   TRUE    TRUE
## impatiens            TRUE     TRUE      TRUE         TRUE  TRUE   TRUE    TRUE
##              Salix Taraxacum Vaccinium Viburnum Viola Paulownia
## bimaculatus   TRUE      TRUE      TRUE     TRUE  TRUE      TRUE
## griseocollis  TRUE      TRUE      TRUE     TRUE  TRUE      TRUE
## impatiens     TRUE      TRUE      TRUE     TRUE  TRUE      TRUE
#stats::chisq.test(dat)
stats::fisher.test(dat, simulate.p.value=TRUE)
## Warning in stats::fisher.test(dat, simulate.p.value = TRUE): 'x' has been
## rounded to integer: Mean relative difference: 0.1697392
## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  2000 replicates)
## 
## data:  dat
## p-value = 0.006997
## alternative hypothesis: two.sided

Plot pollen composition among species

# Plot the data! prop of reads assigned to top genera for each bombus spp
dat<-PollenIDs %>% left_join(., dat.Queen %>% select(BeeID, Bombus_spp, Date), by="BeeID") %>%
  mutate(FamGen=paste(Family, Genus, sep=" "),
         unCommon=ifelse(
           Bombus_spp=="bimaculatus"|Bombus_spp=="griseocollis"|Bombus_spp=="impatiens",
           "common","uncommon")) %>% 
  group_by(BeeID, Family, FamGen) %>%
  summarise(BeeID=first(BeeID),
            Bombus_spp=first(Bombus_spp),
            unCommon=first(unCommon),
            PropReads=sum(PropReads)/first(ifelse(.data$BeeID %in% TwoBarcodeBees$BeeID, 2,1)),
            Family=first(Family),
            Genus=first(Genus),
            FamGen=first(FamGen)
            ) %>% ungroup() %>% arrange(FamGen)
## `summarise()` has grouped output by 'BeeID', 'Family'. You can override using
## the `.groups` argument.
dat %>%
  ggplot(aes(x=Bombus_spp)) +
  geom_col(aes(y=PropReads, 
               fill=ifelse(.data$Genus %in% topGenera2$Genus, FamGen, 
                       ifelse(!is.na(.data$Genus), "z_AllOther", NA))), 
              position="fill") +
  scale_fill_manual(values = myColors.FamGen, na.value="gray")+ #FamGen.palette
  scale_x_discrete(labels = c("pensylvanicus"="pensyl-\nvanicus", "vagans-sandersoni-perplexus" = "vagans-\nsandersoni-\nperplexus")) +
  geom_label(data=dat %>%
               group_by(Bombus_spp) %>%
               summarize(num=n_distinct(BeeID)),
             aes(x=Bombus_spp, y=-0.05, label=paste("n = ", num, sep="")))+
  guides(fill=guide_legend(title = "Family Genus"))+
  theme_bw(base_size = 16)+
  labs(title="", y="Proportion of Sample Reads", x="Bombus species")+
  theme(legend.text = element_text(size=10),
        legend.position = "bottom")

# zoom in: plot the "z_AllOther" pollen families to get an idea of the taxa we're missing from the above plot. 
# this plot shows the proportion of "AllOther" reads assigned to taxonomic family
dat<-PollenIDs %>% left_join(., dat.Queen %>% select(BeeID, Bombus_spp, Date), by="BeeID") %>%
  group_by(BeeID, Family, Genus) %>%
  summarise(BeeID=first(BeeID),
            Bombus_spp=first(Bombus_spp),
            PropReads=sum(PropReads)/first(ifelse(.data$BeeID %in% TwoBarcodeBees$BeeID, 2,1)),
            Family=first(Family),
            Genus=first(Genus),
            ) %>% ungroup() %>%
  filter(!.data$Genus %in% topGenera2$Genus,!is.na(.data$Family))
## `summarise()` has grouped output by 'BeeID', 'Family'. You can override using
## the `.groups` argument.
plot2<-dat %>%
 ggplot(aes(x=Bombus_spp)) +
  geom_col(aes(x=Bombus_spp, y=PropReads,
               fill=forcats::fct_lump_n(Family, 19)),
              position="fill") +
  scale_x_discrete(labels = c("pensylvanicus"="pensyl-\nvanicus", "vagans-sandersoni-perplexus" = "vagans-\nsandersoni-\nperplexus")) +
  scale_fill_manual(values=c("#990F0F", "#B22C2C" , "#CC5151" , "#E57E7E" , "#FFB2B2" , "#99540F" , "#B26F2C" , "#CC8E51" , "#E5B17E" , "#FFD8B2" , "#6B990F" , "#85B22C" , "#A3CC51" , "#C3E57E" , "#E5FFB2" , "#0F6B99" , "#2C85B2" , "#51A3CC" , "#7EC3E5" , "#B2E5FF" , "#8F7EE5", "black")) +
  #  scale_fill_brewer(palette = "Paired", na.value = "black")+ #FamGen.palette
  guides(fill=guide_legend(title = "Family"))+
  theme_bw(base_size = 16)+
  labs(title="Subset 'AllOther' Taxa", ylab="Proportion of 'AllOther' Reads", x="Bombus species")+
  theme(axis.text.x = element_text(size=12))+
  theme(legend.position = "bottom")
# 
# ggpubr::ggarrange(plot1, plot2, ncol = 1, nrow = 2)

ANOVA genera richness, Shannon diversity ~ Bombus_spp

Genera richness

dat<-PollenIDs %>%
  left_join(., dat.Queen %>% select(BeeID, SiteID, Bombus_spp),by="BeeID") %>%
  filter(!Genus %in% singletonGenera) %>% #filter to REMOVE the singleton genera
#  filter(.data$BeeID %in% TwoBarcodeBees$BeeID) %>% #optional filter for BOTH BARCODE BEES
  group_by(BeeID,Bombus_spp)%>%
  summarise(GeneraRichness=n_distinct(Genus)) %>%
  filter(Bombus_spp=="bimaculatus"|Bombus_spp=="griseocollis"|Bombus_spp=="impatiens") #use this when running aov() on just the common bumble bee species
## `summarise()` has grouped output by 'BeeID'. You can override using the
## `.groups` argument.
summary(stats::aov(GeneraRichness ~ Bombus_spp, data = dat)) # no sig difference in richness between 4 most common species, *****unless you exclude bees without data from BOTH barcodes
##             Df Sum Sq Mean Sq F value Pr(>F)
## Bombus_spp   2   55.9   27.94   1.117  0.332
## Residuals   90 2252.0   25.02
plot_df <- dat %>% 
  filter(Bombus_spp=="bimaculatus"|Bombus_spp=="griseocollis"| Bombus_spp=="impatiens") %>%
#  filter(.data$BeeID %in% TwoBarcodeBees$BeeID) %>% #optional filter for BOTH BARCODE BEES
  group_by(Bombus_spp) %>% 
  # calculate mean and standard error of diversity
  summarize(mean = round(mean(GeneraRichness), 2),
            err = stats::sd(GeneraRichness)/sqrt(length(GeneraRichness)),
            n = n()) %>% 
  dplyr::mutate(label = "n") %>% 
  unite("n_label", label, n, sep = " = ", remove = FALSE)

ggplot(plot_df, aes(x = Bombus_spp, y = mean)) +
  geom_col(color = "black", aes(fill = Bombus_spp)) +
#  scale_fill_manual(values = colorBlindness::Blue2DarkRed18Steps) +
  geom_errorbar(aes(ymin = mean - err, ymax = mean + err), width = 0.5) +
  geom_label(aes(x = Bombus_spp, y = 1, label = n_label)) +
  labs(x = "Bombus species",
       y = "Mean Genera Richness",
       title = "All Data (incl. rbcL or ITS2 only samples)")+
  theme_bw(base_size = 18)+
  theme(legend.position = "none")

Shannon diversity

dat<- left_join(PollenIDgenera.sum %>% pivot_wider(., names_from = "Genus", values_from = "PropReads"), 
  dat.Queen %>% select(Date:NestSeek), by="BeeID") %>% 
  arrange(BeeID) %>% 
  filter(.data$BeeID %in% TwoBarcodeBees$BeeID) %>% #optional filter for BOTH BARCODE BEES
  mutate(across(Prunus:Phleum, ~replace(., is.na(.), 0))) %>%
  group_by(BeeID) %>% summarise(across(Prunus:Phleum, ~mean(.)/first(ifelse(.data$BeeID %in% TwoBarcodeBees$BeeID, 2,1)))) %>%
  select(Prunus:Phleum) %>% # select just read data
  select(-any_of(singletonGenera)) # de-select the singleton genera
dat<-as.matrix(dat)

dat2<-left_join(PollenIDgenera.sum %>% pivot_wider(., names_from = "Genus", values_from = "PropReads"), 
  dat.Queen %>% select(Date:NestSeek), by="BeeID") %>% 
  left_join(., dat.Sites %>% select(SiteID, Name), by="SiteID")%>%
  arrange(BeeID) %>%
  filter(.data$BeeID %in% TwoBarcodeBees$BeeID) %>% #optional filter for BOTH BARCODE BEES
  group_by(BeeID) %>% summarise(Bombus_spp=first(Bombus_spp), across(Prunus:Phleum, ~mean(.)/first(ifelse(.data$BeeID %in% TwoBarcodeBees$BeeID, 2,1)))) %>%
  select(BeeID, Bombus_spp) %>%
  ungroup() 

rownames(dat)<-dat2$BeeID


# for the old version of dat and dat2
# rownames(dat)<-paste(dat2$BeeID, dat2$Barcode, sep="_")
# dat2$BeeIDBarcode<-paste(dat2$BeeID, dat2$Barcode, sep="_")

dat2$ShannonD<-vegan::diversity(dat, index="shannon")
dat2<- dat2 %>% filter(Bombus_spp=="bimaculatus"|Bombus_spp=="griseocollis"|Bombus_spp=="impatiens") #use this when running aov() on just the common bumble bee species
summary(stats::aov(ShannonD ~ Bombus_spp, data = dat2)) # WOAH THERE IS sig difference in shannon diversity of pollens collected by three common bombus spp when you only include bees with both barcodes
##             Df Sum Sq Mean Sq F value Pr(>F)  
## Bombus_spp   2  2.633  1.3166   4.268  0.019 *
## Residuals   54 16.659  0.3085                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot_df <- dat2 %>% 
  filter(Bombus_spp=="bimaculatus" | Bombus_spp=="griseocollis"| Bombus_spp=="impatiens") %>%
  group_by(Bombus_spp) %>% 
  # calculate mean and standard error of diversity
  summarize(mean = round(mean(ShannonD), 2),
            err = stats::sd(ShannonD)/sqrt(length(ShannonD)),
            n = n_distinct(BeeID)) %>% 
  dplyr::mutate(label = "n") %>% 
  unite("n_label", label, n, sep = " = ", remove = FALSE)

ggplot(plot_df, aes(x = Bombus_spp, y = mean)) +
  geom_col(color = "black", aes(fill = Bombus_spp)) +
#  scale_fill_manual(values = colorBlindness::Blue2DarkRed18Steps) +
  geom_errorbar(aes(ymin = mean - err, ymax = mean + err), width = 0.5) +
  geom_label(aes(x = Bombus_spp, y = 0.1, label = n_label)) +
  labs(x = "Bombus_spp",
       y = "Mean Shannon diversity",
       title = "All Data (incl. rbcL or ITS2 only samples)") +
  theme_bw(base_size = 18)+
  theme(legend.position = "none")

Redundancy analysis

#this dataset is like dat.Queens plus dat.Sites AND it also keeps data from the two barcodes separate
dat <- left_join(PollenIDgenera.sum %>% pivot_wider(., names_from = "Genus", values_from = "PropReads"), 
  dat.Queen %>% select(Date:NestSeek), by="BeeID") %>% 
  left_join(., dat.Sites, by=c("Date", "SiteID")) %>% 
  mutate(across(Prunus:Phleum, ~replace(., is.na(.), 0))) %>%
  mutate(CapTimeHM=hms::hms(hours=CapTimeH, minutes=CapTimeM, seconds=0), JDate=julian(Date)) %>%  #create variables: CapTimeHM and julian date JDate
  mutate(HostGenus=ifelse(is.na(.data$HostGenus),"none",HostGenus)) %>%
  select(Barcode, BeeID, JDate, Bombus_spp, SiteID, Name, Updated_x, Updated_y, HostGenus, CapTimeHM, Prunus:Phleum) %>%
  rowwise() %>% mutate(max=max(c_across(Prunus:Phleum))) %>% 
  filter(Bombus_spp=="bimaculatus" | Bombus_spp=="griseocollis"| Bombus_spp=="impatiens") %>%
#  filter(Name=="Blandy Experimental Farm") %>%
#  filter(BeeID %in% TwoBarcodeBees$BeeID) %>%
  ungroup()

summary(dat %>% filter(HostGenus=="Elaeagnus") %>% select(Elaeagnus))
##    Elaeagnus      
##  Min.   :0.00000  
##  1st Qu.:0.00000  
##  Median :0.00000  
##  Mean   :0.05672  
##  3rd Qu.:0.03598  
##  Max.   :0.55718
summary(dat %>% filter(HostGenus=="Glechoma") %>% select(Glechoma))
##     Glechoma      
##  Min.   :0.00000  
##  1st Qu.:0.00000  
##  Median :0.02637  
##  Mean   :0.09778  
##  3rd Qu.:0.13449  
##  Max.   :0.60859
summary(dat %>% filter(HostGenus=="Lamium") %>% select(Lamium))
##      Lamium       
##  Min.   :0.00000  
##  1st Qu.:0.05297  
##  Median :0.66332  
##  Mean   :0.51616  
##  3rd Qu.:0.87530  
##  Max.   :0.97651
# explanatory variables must be centered, standardized (if explanatory variables are in different units), transformed (to limit the skew of explanatory variables) or normalized (to linearize relationships) following the same principles as in PCA


# sample size by Bombus species and floral host
dat %>% group_by(Bombus_spp) %>% summarise(n=n_distinct(BeeID)) 
## # A tibble: 3 × 2
##   Bombus_spp       n
##   <fct>        <int>
## 1 bimaculatus     56
## 2 griseocollis    26
## 3 impatiens       11
dat %>% group_by(HostGenus) %>% summarise(n=n_distinct(BeeID)) 
## # A tibble: 17 × 2
##    HostGenus         n
##    <chr>         <int>
##  1 ""                4
##  2 "Barbarea"        5
##  3 "Cercis"          2
##  4 "Chaenomeles"     1
##  5 "Elaeagnus"      26
##  6 "Glechoma"       18
##  7 "Hellebores"      1
##  8 "Lamium"         12
##  9 "Lonicera"        1
## 10 "Malus"           3
## 11 "Mertensia"       4
## 12 "Photina"         1
## 13 "Polemonium"      1
## 14 "Prunus"          8
## 15 "Pyracantha"      4
## 16 "Salix"           1
## 17 "Spiraea"         1
# these counts incl double counts of samples with data from more than one barcode
hist(scale(as.numeric(dat$JDate)))

hist(scale(as.numeric(dat$CapTimeHM)))

dat$JDate<-scale(as.numeric(dat$JDate))
dat$CapTimeHM<-scale(as.numeric(dat$CapTimeHM))
dat$Updated_y<-scale(as.numeric(dat$Updated_y))

dat2<-as.data.frame(dat %>% 
    ungroup() %>%
    select(Prunus:Phleum) %>% 
    select(-c(any_of(singletonGenera), "NA")) #remove genera detected only once
      )
dat2<-as.matrix(dat2)
rownames(dat2)<-dat$BeeID

dat2<-vegan::decostand(dat2, method = "hellinger") # community data standardization

library(vegan)
# global model1 should include: 
#dat2 ~ Bombus_spp + Barcode + JDate + CapTimeHM + HostGenus + Updated_y
#dat2 ~ Bombus_spp + Barcode + JDate + CapTimeHM  + Updated_y (no host model)

global.RDA.step<-ordiR2step(rda(dat2~1, data=dat), scope = rda(dat2 ~ Bombus_spp + Barcode + JDate + CapTimeHM + HostGenus + Updated_y, data=dat))
## Step: R2.adj= 0 
## Call: dat2 ~ 1 
##  
##                 R2.adjusted
## <All variables>  0.31347007
## + HostGenus      0.20083371
## + JDate          0.05254167
## + Barcode        0.04603774
## + Bombus_spp     0.03631868
## + Updated_y      0.02688220
## + CapTimeHM      0.00929700
## <none>           0.00000000
## 
##             Df     AIC      F Pr(>F)   
## + HostGenus 16 -37.986 3.3403  0.002 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: R2.adj= 0.2008337 
## Call: dat2 ~ HostGenus 
##  
##                 R2.adjusted
## <All variables>   0.3134701
## + Barcode         0.2499960
## + JDate           0.2336344
## + Updated_y       0.2196355
## + Bombus_spp      0.2160232
## + CapTimeHM       0.2050629
## <none>            0.2008337
## 
##           Df     AIC      F Pr(>F)   
## + Barcode  1 -46.642 9.7181  0.002 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: R2.adj= 0.249996 
## Call: dat2 ~ HostGenus + Barcode 
##  
##                 R2.adjusted
## <All variables>   0.3134701
## + JDate           0.2803462
## + Updated_y       0.2693801
## + Bombus_spp      0.2661645
## + CapTimeHM       0.2546322
## <none>            0.2499960
## 
##         Df     AIC      F Pr(>F)   
## + JDate  1 -51.979 6.5669  0.002 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: R2.adj= 0.2803462 
## Call: dat2 ~ HostGenus + Barcode + JDate 
##  
##                 R2.adjusted
## <All variables>   0.3134701
## + Updated_y       0.2968662
## + Bombus_spp      0.2946883
## + CapTimeHM       0.2839004
## <none>            0.2803462
## 
##             Df     AIC      F Pr(>F)   
## + Updated_y  1 -54.612 4.0778  0.002 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: R2.adj= 0.2968662 
## Call: dat2 ~ HostGenus + Barcode + JDate + Updated_y 
##  
##                 R2.adjusted
## <All variables>   0.3134701
## + Bombus_spp      0.3109382
## + CapTimeHM       0.3001803
## <none>            0.2968662
## 
##              Df    AIC      F Pr(>F)   
## + Bombus_spp  2 -55.97 2.3274  0.002 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: R2.adj= 0.3109382 
## Call: dat2 ~ HostGenus + Barcode + JDate + Updated_y + Bombus_spp 
##  
##                 R2.adjusted
## + CapTimeHM       0.3134701
## <All variables>   0.3134701
## <none>            0.3109382
# sample size of model
bees<-length(unique(dat$BeeID))
records<-length(dat$BeeID)

#only bees with both barcodes, best model
#RDA_Bombus<-vegan::rda(dat2 ~ HostGenus + Barcode + Bombus_spp + Updated_y + JDate, data=dat) 

#all data best model
RDA_Bombus<-vegan::rda(dat2 ~ HostGenus + Barcode + JDate + Updated_y + Bombus_spp, data=dat) 

#no host best model
#RDA_Bombus<-vegan::rda(dat2 ~ Bombus_spp + Barcode + JDate + Updated_y + CapTimeHM, data=dat) 


#for getting stats from best model
RsquareAdj(RDA_Bombus) 
## $r.squared
## [1] 0.4080543
## 
## $adj.r.squared
## [1] 0.3109382
anova.cca(RDA_Bombus, by="term")
## Permutation test for rda under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## Model: rda(formula = dat2 ~ HostGenus + Barcode + JDate + Updated_y + Bombus_spp, data = dat)
##             Df Variance       F Pr(>F)    
## HostGenus   16  0.25034  3.8740  0.001 ***
## Barcode      1  0.04272 10.5775  0.001 ***
## JDate        1  0.02770  6.8584  0.001 ***
## Updated_y    1  0.01681  4.1611  0.001 ***
## Bombus_spp   2  0.01880  2.3274  0.002 ** 
## Residual   128  0.51697                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(RDA_Bombus)[c(7:9,11)]
## $tot.chi
## [1] 0.8733337
## 
## $constr.chi
## [1] 0.3563675
## 
## $unconst.chi
## [1] 0.5169662
## 
## $concont
## $concont$importance
## Importance of components:
##                          RDA1    RDA2    RDA3    RDA4    RDA5    RDA6    RDA7
## Eigenvalue            0.07344 0.06484 0.04732 0.04091 0.03229 0.01874 0.01521
## Proportion Explained  0.20609 0.18196 0.13278 0.11480 0.09061 0.05259 0.04268
## Cumulative Proportion 0.20609 0.38805 0.52083 0.63562 0.72624 0.77883 0.82151
##                          RDA8    RDA9   RDA10    RDA11    RDA12    RDA13
## Eigenvalue            0.01298 0.01186 0.01048 0.007264 0.005068 0.004534
## Proportion Explained  0.03642 0.03328 0.02942 0.020383 0.014221 0.012723
## Cumulative Proportion 0.85793 0.89122 0.92064 0.941021 0.955242 0.967965
##                          RDA14    RDA15    RDA16    RDA17     RDA18     RDA19
## Eigenvalue            0.002976 0.002335 0.002217 0.001436 0.0009808 0.0007399
## Proportion Explained  0.008352 0.006552 0.006220 0.004029 0.0027523 0.0020763
## Cumulative Proportion 0.976317 0.982870 0.989090 0.993119 0.9958708 0.9979472
##                           RDA20     RDA21
## Eigenvalue            0.0004455 0.0002861
## Proportion Explained  0.0012500 0.0008028
## Cumulative Proportion 0.9991972 1.0000000
#vegan::ordiplot(RDA_Bombus, scaling=1, type="text")
# Scaling 1 shows similarities between objects in the response matrix.
# Sites (numbers) that are closer together have more similar communities.
# Species that are closer together occupy more sites in common.
RDA_plot_df<-vegan::ordiplot(RDA_Bombus, scaling=2, type = "text")

# Scaling 2 shows the effects of explanatory variables.
# Longer arrows mean this variable strongly drives the variation in the community matrix.
# Arrows pointing in opposite directions have a negative relationship.
# Arrows pointing in the same direction have a positive relationship.

#an RDA with host, County, Barcode, Date, and Bombus species can explain ~30% of the total variation in pollen community composition.

#grabbing data from rda model (or the ordiplot thereof) for plotting in ggplot
sites.long <- BiodiversityR::sites.long(RDA_plot_df, env.data=dat)
species.long <- BiodiversityR::species.long(RDA_plot_df)
species.long2<- species.long %>% 
  mutate(diff=sqrt(abs(axis1)^2+abs(axis2)^2)) %>% 
  filter(diff>0.15) %>%
  mutate(labels=paste("Pollen_",labels,sep=""))
axis.long <- BiodiversityR::axis.long(RDA_Bombus, choices=c(1, 2))


arrows <- as.data.frame(vegan::scores(RDA_Bombus)$biplot) #
arrows$names<-rownames(arrows)

arrows <- arrows %>%        # select only Host and Barcode arrows greater than 0.2 units from 0,0
  mutate(diff=sqrt(abs(RDA1)^2+abs(RDA2)^2)) %>% 
  filter(
    !str_starts(names, "Bombus"), #Bombus will be visualized with color
    !str_starts(names,"Barcode") #Barcode will be visualized with shape
    # diff>0.15
         ) %>%
  rowwise() %>% mutate(
    names=ifelse(str_starts(names, "Host"),
      paste("Host_", strsplit(names, split="HostGenus")[[1]][2], sep=""),
                ifelse(str_starts(names, "Barcode"),
             strsplit(names, split="Barcode")[[1]][2],
                        names)) # making names pretty
    ) %>%
  mutate( # create variable to flag and filter hosts with only 1 obs, keep everything else
    keep=ifelse(names %in% ((dat %>% group_by(HostGenus) %>% summarise(n=n_distinct(BeeID)) %>% filter(n>1) %>% mutate(HostGenus=paste("Host_",HostGenus,sep="")))$HostGenus), 1,
              ifelse(!str_starts(names,"Host"), 1, 0))) %>% filter(keep==1) %>% select(-keep) # remove filter variable

colnames(arrows)<-c("axis1","axis2","labels","diff")
rbind(species.long2, arrows)
##                 axis1       axis2           labels      diff
## Prunus    -0.11288000  0.52102630    Pollen_Prunus 0.5331138
## Mertensia -0.17866428  0.09942979 Pollen_Mertensia 0.2044681
## Lamium     0.35279099  0.39807614    Pollen_Lamium 0.5319080
## Cercis    -0.79234535 -0.16051723    Pollen_Cercis 0.8084411
## Acer      -0.01027997  0.16465866      Pollen_Acer 0.1649793
## Platanus   0.08641551 -0.16057704  Pollen_Platanus 0.1823530
## Lonicera   0.10495084 -0.25183148  Pollen_Lonicera 0.2728255
## Pinus      0.24795259 -0.24706315     Pollen_Pinus 0.3500296
## Salix      0.22120681 -0.28329771     Pollen_Salix 0.3594302
## Elaeagnus  0.05751494 -0.17683115 Pollen_Elaeagnus 0.1859495
## 1          0.19182521 -0.02551423    Host_Barbarea 0.1935146
## 2         -0.42554002 -0.11812127      Host_Cercis 0.4416299
## 3         -0.05601890 -0.40583902   Host_Elaeagnus 0.4096870
## 4         -0.01229819  0.10829750    Host_Glechoma 0.1089935
## 5          0.41803372  0.33924760      Host_Lamium 0.5383689
## 6          0.11946913 -0.16560654       Host_Malus 0.2042019
## 7         -0.53335114 -0.08278345   Host_Mertensia 0.5397375
## 8         -0.13443660  0.44378379      Host_Prunus 0.4636995
## 9          0.07244374 -0.05486005  Host_Pyracantha 0.0908720
## 10         0.21647657 -0.71850933            JDate 0.7504117
## 11        -0.31734438 -0.36164683        Updated_y 0.4811402
plot1<- ggplot() + 
    geom_vline(xintercept = c(0), color = "grey70", linetype = 2) +
    geom_hline(yintercept = c(0), color = "grey70", linetype = 2) +  
    labs(x=axis.long[1, "label"], y=axis.long[2, "label"]) +  
    scale_x_continuous(sec.axis = dup_axis(labels=NULL, name=NULL)) +
    scale_y_continuous(sec.axis = dup_axis(labels=NULL, name=NULL)) + 
    geom_point(data=sites.long, 
               aes(x=axis1, y=axis2, color=Bombus_spp, shape=Barcode), 
               size=5, alpha=0.7) +
#    scale_shape_manual(values=c(16, 17)) +
#    stat_ellipse(data=sites.long, aes(x=axis1, y=axis2, colour=Bombus_spp), linewidth = 1) +
    geom_point(data=species.long, aes(x=axis1, y=axis2), size=3, alpha=0.7) +
    ggrepel::geom_text_repel(data=rbind(species.long2, arrows), aes(x = axis1, y = axis2, label = labels), direction = "both", size=5, box.padding = 0.75) +
    geom_segment(data=arrows, aes(x = 0, xend = axis1, y = 0, yend = axis2), arrow = arrow(length = unit(0.25, "cm")), colour="black") +
    coord_fixed(ratio=1) +
    theme_bw(base_size = 18)+
    labs(title=paste("Bombus species differences (n = ",records," records from ",bees," bees)", sep=""))+
    theme(axis.title = element_text(size=18), 
        axis.text=element_text(size=14))
plot1
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

detach('package:vegan')

#whether or not you exclude samples without both barcodes, the model tells the same story

Does pollen composition vary with land use? (NO Blandy, YES Rare bees)

LMM Shannon diversity of pollen ~ site land use

We also and fit a mixed effects linear model with four land use variables (i.e., Forest, Herb, Urban, Crop), survey date and sample collection time, using the lme function of the nlme package.

#this dataset is dat.Queens plus dat.Sites AND it also keeps data from the two barcodes separate
# environmental variables dataset
dat<- left_join(PollenIDgenera.sum %>% pivot_wider(., names_from = "Genus", values_from = "PropReads"), 
  dat.Queen %>% select(Date:NestSeek), by="BeeID") %>% 
  left_join(., dat.Sites, by=c("Date", "SiteID")) %>% 
  mutate(across(Prunus:Phleum, ~replace(., is.na(.), 0))) %>%
  mutate(CapTimeHM=hms::hms(hours=CapTimeH, minutes=CapTimeM, seconds=0), JDate=julian(Date)) %>%  #create variables
  select(Barcode, BeeID, SiteID, Name, 
         JDate, CapTimeHM, 
         Updated_y, NLCD1000_Water:NLCD1000_Crop, 
         Prunus:Phleum, 
         -"NA") %>%
#  filter(BeeID %in% TwoBarcodeBees$BeeID) %>%
  filter(Name!="Blandy Experimental Farm") %>% #remove Blandy bees
  ungroup()

dat
## # A tibble: 76 × 107
##    Barcode BeeID   SiteID Name          JDate CapTimeHM Updated_y NLCD1000_Water
##    <chr>   <fct>   <fct>  <chr>         <drt> <time>        <dbl>          <dbl>
##  1 ITS2    CKC0001 GMCP1  Gertrude E. … 1908… 14:25          38.9       0       
##  2 ITS2    ESE0004 DTP2   Darden Towe … 1909… 13:04          38.0       0.0194  
##  3 ITS2    KLS0007 ICF1   Ivy Creek Fo… 1907… 14:51          38.1       0.0988  
##  4 ITS2    KLS0044 VHVG1  Vint Hill Vi… 1908… 12:04          38.7       0.000578
##  5 ITS2    KLS0045 VHVG1  Vint Hill Vi… 1908… 12:06          38.7       0.000578
##  6 ITS2    KLS0052 GMCP1  Gertrude E. … 1908… 13:38          38.9       0       
##  7 ITS2    KLS0054 GMCP1  Gertrude E. … 1908… 14:15          38.9       0       
##  8 ITS2    KLS0055 GMCP1  Gertrude E. … 1908… 14:16          38.9       0       
##  9 ITS2    KLS0071 SP2    Seilheimer P… 1909… 09:56          38.0       0.0271  
## 10 ITS2    KLS0096 ICF2   Ivy Creek Fo… 1909… 09:52          38.1       0.0988  
## # ℹ 66 more rows
## # ℹ 99 more variables: NLCD1000_Developed <dbl>, NLCD1000_Forest <dbl>,
## #   NLCD1000_Herbaceous <dbl>, NLCD1000_Crop <dbl>, Prunus <dbl>,
## #   Dicentra <dbl>, Mertensia <dbl>, Quercus <dbl>, Lamium <dbl>,
## #   Polemonium <dbl>, Viola <dbl>, Cardamine <dbl>, Potentilla <dbl>,
## #   Taraxacum <dbl>, Buglossoides <dbl>, Medicago <dbl>, Cercis <dbl>,
## #   Acer <dbl>, Exochorda <dbl>, Mummenhoffia <dbl>, Halesia <dbl>, …
# does shannon diversity of pollen samples differ among sites? across land use contexts?
dat<-as.data.frame(dat)
colnames(dat)
##   [1] "Barcode"             "BeeID"               "SiteID"             
##   [4] "Name"                "JDate"               "CapTimeHM"          
##   [7] "Updated_y"           "NLCD1000_Water"      "NLCD1000_Developed" 
##  [10] "NLCD1000_Forest"     "NLCD1000_Herbaceous" "NLCD1000_Crop"      
##  [13] "Prunus"              "Dicentra"            "Mertensia"          
##  [16] "Quercus"             "Lamium"              "Polemonium"         
##  [19] "Viola"               "Cardamine"           "Potentilla"         
##  [22] "Taraxacum"           "Buglossoides"        "Medicago"           
##  [25] "Cercis"              "Acer"                "Exochorda"          
##  [28] "Mummenhoffia"        "Halesia"             "Packera"            
##  [31] "Robinia"             "Platanus"            "Geum"               
##  [34] "Podophyllum"         "Thlaspi"             "Ranunculus"         
##  [37] "Viburnum"            "Syringa"             "Crataegus"          
##  [40] "Erigeron"            "Chelidonium"         "Leucojum"           
##  [43] "Photinia"            "Paulownia"           "Fontanesia"         
##  [46] "Hesperis"            "Aesculus"            "Rosa"               
##  [49] "Celastrus"           "Lamprocapnos"        "Pulmonaria"         
##  [52] "Aronia"              "Stylophorum"         "Pyrus"              
##  [55] "Malus"               "Fagus"               "Vicia"              
##  [58] "Berberis"            "Vaccinium"           "Impatiens"          
##  [61] "Gaylussacia"         "Anemone"             "Spiraea"            
##  [64] "Lonicera"            "Pinus"               "Salix"              
##  [67] "Anthoxanthum"        "Brassica"            "Elaeagnus"          
##  [70] "Glechoma"            "Liquidambar"         "Lupinus"            
##  [73] "Morus"               "Chamaecyparis"       "Veronica"           
##  [76] "Asimina"             "Cunninghamia"        "Lolium"             
##  [79] "Poa"                 "Vinca"               "Cynoglossum"        
##  [82] "Euscaphis"           "Akebia"              "Magnolia"           
##  [85] "Ginkgo"              "Picea"               "Trifolium"          
##  [88] "Barbarea"            "Dactylis"            "Fraxinus"           
##  [91] "Wisteria"            "Virgilia"            "Liriodendron"       
##  [94] "Nyssa"               "Gleditsia"           "Vitis"              
##  [97] "Securigera"          "Gelsemium"           "Pachysandra"        
## [100] "Scilla"              "Toxicodendron"       "Ligustrum"          
## [103] "Cornus"              "Alliaria"            "Rubus"              
## [106] "Isatis"              "Phleum"
dat2<-as.matrix(dat %>% select(Prunus:Phleum) %>% select(-any_of(singletonGenera)))
rownames(dat2)<-dat$BeeID

dat$ShannonD<-vegan::diversity(dat2, index="shannon")

hist(dat$ShannonD)

hist(as.numeric(dat$JDate))

hist(dat$NLCD1000_Developed)

hist(scale(dat$ShannonD))

hist(scale(as.numeric(dat$JDate)))

hist(scale(dat$NLCD1000_Developed))

polldivers.lmm <- nlme::lme(ShannonD~scale(NLCD1000_Developed)+scale(NLCD1000_Forest)+scale(NLCD1000_Herbaceous)+scale(NLCD1000_Crop)+scale(as.numeric(JDate))+scale(CapTimeHM), random=~1|Name, data=dat)
plot(polldivers.lmm)

summary(polldivers.lmm) #conditional t-test testing the marginal significance of each fixed effect coefficient with the other fixed effects in the model
## Linear mixed-effects model fit by REML
##   Data: dat 
##        AIC     BIC    logLik
##   129.5641 149.671 -55.78204
## 
## Random effects:
##  Formula: ~1 | Name
##         (Intercept)  Residual
## StdDev:  0.06317034 0.4533868
## 
## Fixed effects:  ShannonD ~ scale(NLCD1000_Developed) + scale(NLCD1000_Forest) +      scale(NLCD1000_Herbaceous) + scale(NLCD1000_Crop) + scale(as.numeric(JDate)) +      scale(CapTimeHM) 
##                                Value Std.Error DF   t-value p-value
## (Intercept)                0.7074114 0.0557090 58 12.698321  0.0000
## scale(NLCD1000_Developed)  0.2851386 0.8096789 11  0.352163  0.7314
## scale(NLCD1000_Forest)     0.3825655 0.7648605 11  0.500177  0.6268
## scale(NLCD1000_Herbaceous) 0.1904430 0.5330449 11  0.357274  0.7276
## scale(NLCD1000_Crop)       0.1234263 0.2944109 11  0.419231  0.6831
## scale(as.numeric(JDate))   0.1525481 0.0567882 58  2.686263  0.0094
## scale(CapTimeHM)           0.0096755 0.0679871 58  0.142314  0.8873
##  Correlation: 
##                            (Intr) s(NLCD1000_D s(NLCD1000_F s(NLCD1000_H
## scale(NLCD1000_Developed)   0.025                                       
## scale(NLCD1000_Forest)      0.024  0.996                                
## scale(NLCD1000_Herbaceous)  0.019  0.992        0.993                   
## scale(NLCD1000_Crop)        0.028  0.979        0.978        0.967      
## scale(as.numeric(JDate))    0.006 -0.201       -0.201       -0.225      
## scale(CapTimeHM)           -0.002  0.414        0.442        0.453      
##                            s(NLCD1000_C s(.(JD
## scale(NLCD1000_Developed)                     
## scale(NLCD1000_Forest)                        
## scale(NLCD1000_Herbaceous)                    
## scale(NLCD1000_Crop)                          
## scale(as.numeric(JDate))   -0.165             
## scale(CapTimeHM)            0.421       -0.021
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -1.7826417 -0.7307585 -0.1646942  0.7885067  1.9223805 
## 
## Number of Observations: 76
## Number of Groups: 16
stats::anova(polldivers.lmm) #conditional F-test testing the significance of terms in the fixed effects model (sequentially by default)
##                            numDF denDF   F-value p-value
## (Intercept)                    1    58 160.90101  <.0001
## scale(NLCD1000_Developed)      1    11   2.78950  0.1231
## scale(NLCD1000_Forest)         1    11   1.06107  0.3251
## scale(NLCD1000_Herbaceous)     1    11   0.38391  0.5481
## scale(NLCD1000_Crop)           1    11   0.76080  0.4017
## scale(as.numeric(JDate))       1    58   7.23560  0.0093
## scale(CapTimeHM)               1    58   0.02025  0.8873
summary<-summary(polldivers.lmm)
summary$tTable
##                                  Value  Std.Error DF    t-value      p-value
## (Intercept)                0.707411359 0.05570905 58 12.6983207 2.161841e-18
## scale(NLCD1000_Developed)  0.285138582 0.80967888 11  0.3521626 7.313706e-01
## scale(NLCD1000_Forest)     0.382565510 0.76486054 11  0.5001768 6.268057e-01
## scale(NLCD1000_Herbaceous) 0.190442951 0.53304490 11  0.3572738 7.276469e-01
## scale(NLCD1000_Crop)       0.123426281 0.29441093 11  0.4192313 6.831171e-01
## scale(as.numeric(JDate))   0.152548109 0.05678823 58  2.6862628 9.409566e-03
## scale(CapTimeHM)           0.009675501 0.06798714 58  0.1423137 8.873255e-01
plot(dat$JDate,dat$ShannonD)

dat %>%
  ggplot(aes(x=JDate, y=ShannonD))+
  geom_point(size=4)+
  stat_smooth(method=stats::lm, se=F)+
  theme_bw(base_size = 18)+
  labs(title="Pollen diversity over time",
       y="Shannon diversity index",
       x="Julian date")
## Don't know how to automatically pick scale for object of type <difftime>.
## Defaulting to continuous.
## `geom_smooth()` using formula = 'y ~ x'

Redundancy analysis

#this dataset is dat.Queens plus dat.Sites AND it also keeps data from the two barcodes separate
# environmental variables dataset
dat<- left_join(PollenIDgenera.sum %>% pivot_wider(., names_from = "Genus", values_from = "PropReads"), 
  dat.Queen %>% select(Date:NestSeek), by="BeeID") %>% 
  left_join(., dat.Sites, by=c("Date", "SiteID")) %>% 
  mutate(across(Prunus:Phleum, ~replace(., is.na(.), 0))) %>%
  mutate(CapTimeHM=hms::hms(hours=CapTimeH, minutes=CapTimeM, seconds=0), JDate=julian(Date)) %>%  #create variables
  select(Barcode, BeeID, SiteID, Name, 
         JDate, CapTimeHM, 
         Updated_y, NLCD1000_Water:NLCD1000_Crop, 
         Prunus:Phleum, 
         -"NA") %>%
#  filter(BeeID %in% TwoBarcodeBees$BeeID) %>% #optional
  filter(Name!="Blandy Experimental Farm") %>% #remove Blandy bees (REQUIRED)
  ungroup()

# explanatory variables must be centered, standardized (if explanatory variables are in different units), transformed (to limit the skew of explanatory variables) or normalized (to linearize relationships) following the same principles as in PCA

# global model2 should include: 
#NLCD1000_Forest + NLCD1000_Herbaceous + NLCD1000_Crop + NLCD1000_Developed + Barcode + Updated_y + JDate + CapTimeHM

#viewing distribution of explanatory, environmental variables 
hist(dat$NLCD1000_Herbaceous)

hist(dat$Updated_y)

hist(as.numeric(dat$JDate))

hist(scale(dat$NLCD1000_Herbaceous))

hist(scale(dat$Updated_y))

hist(scale(as.numeric(dat$JDate)))

# save center and scale of original Latitude (for plotting later, optional)
# center<-attr(scale(dat$Updated_y), "scaled:center")
# scale<-attr(scale(dat$Updated_y), "scaled:scale")

#transformations & standardization of environmental variables
dat<-dat %>% mutate(across(NLCD1000_Water:NLCD1000_Crop, ~scale(.)),
                    ) %>% as.data.frame()
dat$JDate<-scale(as.numeric(dat$JDate))
dat$Updated_y<-scale(as.numeric(dat$Updated_y))


# pollen community dataset
dat2<-as.data.frame(dat %>% 
    ungroup() %>%
    select(Prunus:Phleum) %>% 
    select(-any_of(singletonGenera))
      )
dat2<-as.matrix(dat2)
rownames(dat2)<-dat$BeeID

#standardization of pollen community data
dat2<-vegan::decostand(dat2,method = "hellinger") # community data standardization

library(vegan)
global.RDA.step<-ordiR2step(rda(dat2~1, data=dat), scope = rda(dat2 ~ NLCD1000_Forest + NLCD1000_Herbaceous + NLCD1000_Crop + NLCD1000_Developed + CapTimeHM + JDate + Updated_y + Barcode, data=dat))
## Step: R2.adj= 0 
## Call: dat2 ~ 1 
##  
##                       R2.adjusted
## <All variables>       0.269073139
## + JDate               0.084544164
## + NLCD1000_Crop       0.053466365
## + Barcode             0.046724941
## + NLCD1000_Developed  0.041377666
## + Updated_y           0.032432806
## + NLCD1000_Forest     0.021171414
## + CapTimeHM           0.009438771
## + NLCD1000_Herbaceous 0.007206977
## <none>                0.000000000
## 
##         Df     AIC      F Pr(>F)   
## + JDate  1 -18.232 7.9264  0.002 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: R2.adj= 0.08454416 
## Call: dat2 ~ JDate 
##  
##                       R2.adjusted
## <All variables>        0.26907314
## + NLCD1000_Crop        0.14437415
## + Updated_y            0.13793315
## + NLCD1000_Developed   0.12312361
## + Barcode              0.11881977
## + NLCD1000_Forest      0.10685527
## + CapTimeHM            0.09442550
## + NLCD1000_Herbaceous  0.09065659
## <none>                 0.08454416
## 
##                 Df     AIC      F Pr(>F)   
## + NLCD1000_Crop  1 -22.403 6.1745  0.002 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: R2.adj= 0.1443741 
## Call: dat2 ~ JDate + NLCD1000_Crop 
##  
##                       R2.adjusted
## <All variables>         0.2690731
## + Updated_y             0.1955759
## + NLCD1000_Developed    0.1792995
## + Barcode               0.1791862
## + NLCD1000_Forest       0.1718648
## + CapTimeHM             0.1549744
## + NLCD1000_Herbaceous   0.1543438
## <none>                  0.1443741
## 
##             Df     AIC      F Pr(>F)   
## + Updated_y  1 -26.141 5.6465  0.002 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: R2.adj= 0.1955759 
## Call: dat2 ~ JDate + NLCD1000_Crop + Updated_y 
##  
##                       R2.adjusted
## <All variables>         0.2690731
## + Barcode               0.2313515
## + NLCD1000_Herbaceous   0.2059329
## + NLCD1000_Developed    0.2057908
## + NLCD1000_Forest       0.1998819
## + CapTimeHM             0.1989565
## <none>                  0.1955759
## 
##           Df     AIC      F Pr(>F)   
## + Barcode  1 -28.662 4.3511  0.002 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: R2.adj= 0.2313515 
## Call: dat2 ~ JDate + NLCD1000_Crop + Updated_y + Barcode 
##  
##                       R2.adjusted
## <All variables>         0.2690731
## + NLCD1000_Herbaceous   0.2412230
## + NLCD1000_Developed    0.2406660
## + NLCD1000_Forest       0.2362107
## + CapTimeHM             0.2352266
## <none>                  0.2313515
## 
##                       Df     AIC      F Pr(>F)  
## + NLCD1000_Herbaceous  1 -28.722 1.9237  0.028 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: R2.adj= 0.241223 
## Call: dat2 ~ JDate + NLCD1000_Crop + Updated_y + Barcode + NLCD1000_Herbaceous 
##  
##                      R2.adjusted
## <All variables>        0.2690731
## + CapTimeHM            0.2475644
## + NLCD1000_Forest      0.2474796
## + NLCD1000_Developed   0.2474379
## <none>                 0.2412230
## 
##             Df     AIC    F Pr(>F)  
## + CapTimeHM  1 -28.453 1.59  0.046 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: R2.adj= 0.2475644 
## Call: dat2 ~ JDate + NLCD1000_Crop + Updated_y + Barcode + NLCD1000_Herbaceous +      CapTimeHM 
##  
##                      R2.adjusted
## <All variables>        0.2690731
## + NLCD1000_Forest      0.2554377
## + NLCD1000_Developed   0.2552923
## <none>                 0.2475644
## 
##                   Df     AIC      F Pr(>F)  
## + NLCD1000_Forest  1 -28.362 1.7296  0.062 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#sample size calc
records<-dim(dat)[1] # rows
bees<-n_distinct(dat$BeeID) # bees
sites<-n_distinct(dat$Name) # sites

#best model of full dataset
RDA_LandCover<-vegan::rda(dat2 ~ JDate + NLCD1000_Crop + Updated_y + Barcode + NLCD1000_Herbaceous, data=dat) #n=76 datapoints from 52 bees from 16 sites

#best model of subset (i.e., remove samples without both barcodes) n=48 datapoints from 24 bees from 12 sites
# RDA_LandCover<-vegan::rda(dat2 ~ JDate + NLCD1000_Crop + Updated_y + Barcode + NLCD1000_Developed + NLCD1000_Forest + NLCD1000_Herbaceous , data=dat)

RsquareAdj(RDA_LandCover) 
## $r.squared
## [1] 0.2918081
## 
## $adj.r.squared
## [1] 0.241223
anova.cca(RDA_LandCover, by="term")
## Permutation test for rda under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## Model: rda(formula = dat2 ~ JDate + NLCD1000_Crop + Updated_y + Barcode + NLCD1000_Herbaceous, data = dat)
##                     Df Variance      F Pr(>F)    
## JDate                1  0.08101 9.5631  0.001 ***
## NLCD1000_Crop        1  0.05898 6.9626  0.001 ***
## Updated_y            1  0.05071 5.9861  0.001 ***
## Barcode              1  0.03734 4.4077  0.001 ***
## NLCD1000_Herbaceous  1  0.01630 1.9237  0.043 *  
## Residual            70  0.59299                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(RDA_LandCover)[c(7:9,11)]
## $tot.chi
## [1] 0.8373357
## 
## $constr.chi
## [1] 0.2443414
## 
## $unconst.chi
## [1] 0.5929944
## 
## $concont
## $concont$importance
## Importance of components:
##                         RDA1    RDA2    RDA3    RDA4    RDA5
## Eigenvalue            0.1063 0.05512 0.04741 0.02338 0.01212
## Proportion Explained  0.4351 0.22557 0.19405 0.09567 0.04959
## Cumulative Proportion 0.4351 0.66069 0.85474 0.95041 1.00000
#anova.cca(RDA_LandCover, by="axis")
#vegan::ordiplot(RDA_LandCover, scaling=1, type="text")
# Scaling 1 shows similarities between objects in the response matrix.
# Sites (numbers) that are closer together have more similar communities.
# Species that are closer together occupy more sites in common.
RDA_plot_df<-vegan::ordiplot(RDA_LandCover, scaling=2, type = "text")

# Scaling 2 shows the effects of explanatory variables.
# Longer arrows mean this variable strongly drives the variation in the community matrix.
# Arrows pointing in opposite directions have a negative relationship.
# Arrows pointing in the same direction have a positive relationship.

#an RDA with x,x,x can explain about xx.x% of the total variation in pollen community composition. anova.cca() generate a significant p-value for this difference

#grabbing data from rda model (or the ordiplot thereof) for plotting in ggplot
#grabbing data from rda model (or the ordiplot thereof) for plotting in ggplot
sites.long <- BiodiversityR::sites.long(RDA_plot_df, env.data=dat)
species.long <- BiodiversityR::species.long(RDA_plot_df)
species.long2<- species.long %>% 
  mutate(diff=sqrt(abs(axis1)^2+abs(axis2)^2)) %>% 
  filter(diff>0.15) # select pollen species greater than X.XX units from origin (0,0)
axis.long <- BiodiversityR::axis.long(RDA_LandCover, choices=c(1, 2))

arrows <- as.data.frame(vegan::scores(RDA_LandCover)$biplot) #
arrows$names<-rownames(arrows)
arrows <- arrows %>%        # select non-County arrows greater than 0.2 units from 0,0
  mutate(diff=sqrt(abs(RDA1)^2+abs(RDA2)^2)) %>% 
  filter(!str_starts(names,"Barcode"), #represented as color and shape
         diff>0.15)


colnames(arrows)<-c("axis1","axis2","labels","diff")
rbind(species.long2, arrows)
##                            axis1       axis2              labels      diff
## Prunus              -0.773127745  0.27097049              Prunus 0.8192384
## Lamium               0.338261530  0.52213164              Lamium 0.6221272
## Cercis              -0.008520592 -0.27520275              Cercis 0.2753346
## Acer                -0.258346269 -0.15696354                Acer 0.3022918
## Vaccinium            0.048628194 -0.16651499           Vaccinium 0.1734703
## Pinus                0.306146504  0.01898303               Pinus 0.3067345
## Salix                0.222940702 -0.07908980               Salix 0.2365539
## JDate                0.812812428 -0.19866066               JDate 0.8367378
## NLCD1000_Crop        0.283497851  0.79727300       NLCD1000_Crop 0.8461769
## Updated_y           -0.095481519  0.16985197           Updated_y 0.1948497
## NLCD1000_Herbaceous  0.162970704 -0.06617047 NLCD1000_Herbaceous 0.1758920
#sites.long$Updated_y_descaled<-as.numeric(unlist(ggfortify::unscale(dat$Updated_y, center=center, scale=scale)))

plot2<-ggplot() + 
    geom_vline(xintercept = c(0), color = "grey70", linetype = 2) +
    geom_hline(yintercept = c(0), color = "grey70", linetype = 2) +  
    labs(x=axis.long[1, "label"], y=axis.long[2, "label"]) +  
    scale_x_continuous(sec.axis = dup_axis(labels=NULL, name=NULL)) +
    scale_y_continuous(sec.axis = dup_axis(labels=NULL, name=NULL)) + 
    geom_point(data=sites.long, aes(x=axis1, y=axis2, shape=Barcode, color=Barcode), size=5, alpha=0.7) +
    geom_point(data=species.long, aes(x=axis1, y=axis2), size=3, alpha=0.7) +
    ggrepel::geom_text_repel(data=rbind(species.long2, arrows), aes(x = axis1, y = axis2, label = labels), size=5, box.padding = 0.75, direction = "both") +
    geom_segment(data=arrows, aes(x=0, xend=axis1, y=0, yend=axis2), arrow=arrow(length=unit(0.25, "cm")), colour="black") +
    coord_fixed(ratio=1) +
  theme_bw(base_size = 18)+
  labs(title=paste("Land cover differences (n=", records, " records from ", bees, " bees, ", sites, " sites)", sep=""))
plot2
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

ggpubr::ggarrange(plot1, plot2, ncol = 2, nrow = 1)
## Warning: ggrepel: 21 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 10 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Misc Summary/Counts

summary of queen obs

# total queens observed
dat<-dat.Sites%>%filter(Name!="Blandy Experimental Farm")
sum(dat.Sites$TotalCaptured + dat.Sites$TotalCaptured) #total
## [1] 1166
sum(dat$TotalCaptured + dat$TotalCaptured) #excluding blandy
## [1] 690
# bombus queen richness
unique(dat.Queen$Bombus_spp)
## [1] fervidus                    bimaculatus                
## [3] griseocollis                vagans-sandersoni-perplexus
## [5] impatiens                   pensylvanicus              
## [7] auricomus                  
## 7 Levels: auricomus bimaculatus fervidus griseocollis ... vagans-sandersoni-perplexus
# by species, a count of bees, date of first detect, first pollen, first nest
left_join(
dat.Queen %>% 
  group_by(Bombus_spp) %>% 
  summarise(
    Total=n(),
    FirstDetect=min(Date)
  ),
dat.Queen %>% 
  group_by(Bombus_spp) %>% 
  filter(CarryPollen==1) %>%
  summarise(
    CarryPollen=n(),
    FirstPollen=min(Date)
  ),
by="Bombus_spp") %>% 
  left_join(.,
dat.Queen %>% 
  group_by(Bombus_spp) %>% 
  filter(NestSeek==1) %>%
  summarise(
    NestSeek=n(),
    FirstNestSeek=min(Date))
  ,
by="Bombus_spp") %>% 
  left_join(.,
dat.Queen %>% 
  group_by(Bombus_spp) %>% 
  filter(Forage==1) %>%
  summarise(
    Forage=n(),
    FirstForage=min(Date))
  ,
by="Bombus_spp")
## # A tibble: 7 × 9
##   Bombus_spp  Total FirstDetect         CarryPollen FirstPollen         NestSeek
##   <fct>       <int> <dttm>                    <int> <dttm>                 <int>
## 1 auricomus       3 2022-04-28 00:00:00           3 2022-04-28 00:00:00       NA
## 2 bimaculatus    56 2022-03-26 00:00:00          56 2022-03-26 00:00:00       NA
## 3 fervidus        8 2022-04-25 00:00:00           8 2022-04-25 00:00:00       NA
## 4 griseocoll…    26 2022-04-16 00:00:00          26 2022-04-16 00:00:00       NA
## 5 impatiens      11 2022-03-22 00:00:00          11 2022-03-22 00:00:00        1
## 6 pensylvani…     1 2022-04-24 00:00:00           1 2022-04-24 00:00:00       NA
## 7 vagans-san…     2 2022-04-13 00:00:00           2 2022-04-13 00:00:00       NA
## # ℹ 3 more variables: FirstNestSeek <dttm>, Forage <int>, FirstForage <dttm>
dat.Queen %>% group_by(CarryPollen,Bombus_spp) %>% #number of queens with pollen by species
  summarise(Count=n())
## `summarise()` has grouped output by 'CarryPollen'. You can override using the
## `.groups` argument.
## # A tibble: 7 × 3
## # Groups:   CarryPollen [1]
##   CarryPollen Bombus_spp                  Count
##         <int> <fct>                       <int>
## 1           1 auricomus                       3
## 2           1 bimaculatus                    56
## 3           1 fervidus                        8
## 4           1 griseocollis                   26
## 5           1 impatiens                      11
## 6           1 pensylvanicus                   1
## 7           1 vagans-sandersoni-perplexus     2
#filter out blandy
dat.Queen %>% filter(!str_starts(SiteID, "BEF")) %>% 
  group_by(Bombus_spp) %>% #number of queens with pollen by species
  summarise(Count=n())
## # A tibble: 6 × 2
##   Bombus_spp                  Count
##   <fct>                       <int>
## 1 bimaculatus                    30
## 2 fervidus                        5
## 3 griseocollis                    7
## 4 impatiens                       7
## 5 pensylvanicus                   1
## 6 vagans-sandersoni-perplexus     2
dat.Queen %>% filter(!str_starts(SiteID, "BEF")) %>% 
  group_by(CarryPollen,Bombus_spp)%>% #number of queens with pollen by species
  summarise(Count=n())
## `summarise()` has grouped output by 'CarryPollen'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 3
## # Groups:   CarryPollen [1]
##   CarryPollen Bombus_spp                  Count
##         <int> <fct>                       <int>
## 1           1 bimaculatus                    30
## 2           1 fervidus                        5
## 3           1 griseocollis                    7
## 4           1 impatiens                       7
## 5           1 pensylvanicus                   1
## 6           1 vagans-sandersoni-perplexus     2
dat.Queen %>% filter(!str_starts(SiteID, "BEF")) %>% 
  group_by(NestSeek, Bombus_spp)%>%  #number of queens nest seeking
  summarise(Count=n())
## `summarise()` has grouped output by 'NestSeek'. You can override using the
## `.groups` argument.
## # A tibble: 7 × 3
## # Groups:   NestSeek [2]
##   NestSeek Bombus_spp                  Count
##      <dbl> <fct>                       <int>
## 1        0 bimaculatus                    30
## 2        0 fervidus                        5
## 3        0 griseocollis                    7
## 4        0 impatiens                       6
## 5        0 pensylvanicus                   1
## 6        0 vagans-sandersoni-perplexus     2
## 7        1 impatiens                       1

survey site list

dat.Sites %>% select(SiteID, bimaculatus:pensylvanicus) %>%
  pivot_longer(cols=c(bimaculatus:pensylvanicus), names_to = "Bombus_spp", values_to = "count") %>%
  filter(count>0) %>%
  group_by(SiteID) %>%
  summarize(BeeRichness=n()) %>%
  left_join(., dat.Sites, by="SiteID") %>%
  select(Name, SiteID, County, Type, Updated_x, Updated_y, Date, FloralRichness, Bees_60min, BeeRichness)
## # A tibble: 70 × 10
##    Name              SiteID County Type  Updated_x Updated_y Date               
##    <chr>             <fct>  <chr>  <chr>     <dbl>     <dbl> <dttm>             
##  1 Blandy Experimen… BEF_01 Clarke Loca…     -78.1      39.1 2022-03-30 00:00:00
##  2 Blandy Experimen… BEF_02 Clarke Loca…     -78.1      39.1 2022-03-31 00:00:00
##  3 Blandy Experimen… BEF_03 Clarke Loca…     -78.1      39.1 2022-04-01 00:00:00
##  4 Blandy Experimen… BEF_04 Clarke Loca…     -78.1      39.1 2022-04-04 00:00:00
##  5 Blandy Experimen… BEF_06 Clarke Loca…     -78.1      39.1 2022-04-08 00:00:00
##  6 Blandy Experimen… BEF_09 Clarke Loca…     -78.1      39.1 2022-04-11 00:00:00
##  7 Blandy Experimen… BEF_11 Clarke Loca…     -78.1      39.1 2022-04-13 00:00:00
##  8 Blandy Experimen… BEF_12 Clarke Loca…     -78.1      39.1 2022-04-14 00:00:00
##  9 Blandy Experimen… BEF_14 Clarke Loca…     -78.1      39.1 2022-04-15 00:00:00
## 10 Blandy Experimen… BEF_15 Clarke Loca…     -78.1      39.1 2022-04-20 00:00:00
## # ℹ 60 more rows
## # ℹ 3 more variables: FloralRichness <dbl>, Bees_60min <dbl>, BeeRichness <int>

avg distance bw sites

dat<-dat.Sites %>% select(Name, Updated_x, Updated_y) %>% distinct()
dist<-raster::pointDistance(dat[, c("Updated_x", "Updated_y")], lonlat=TRUE)
dist[dist==0]<-NA

mean(dist, na.rm = T)
## [1] 55327.82
min(dist, na.rm = T)
## [1] 1751.81
max(dist, na.rm = T)
## [1] 141190.9
rm(dist)